diff --git a/.gitignore b/.gitignore index 801790d0a4720..664c45b7202f6 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c5f711d2918b..6988434996bcc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It uses the options above, so it is put here include(init) include(generic) # simplify cmake module diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c5f..d3f330ba9dd0f 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 9f6fd32ad986c..5c48afa2806aa 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake new file mode 100644 index 0000000000000..2162f87812d13 --- /dev/null +++ b/cmake/external/onnxruntime.cmake @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +if (NOT WITH_ONNXRUNTIME) + return() +endif () + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +add_definitions(-DPADDLE_WITH_ONNXRUNTIME) + +SET(ONNXRUNTIME_PROJECT "extern_onnxruntime") +SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) +SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) +SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) +SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE) +SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE) +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") + + +if (WIN32) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip") +elseif (APPLE) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz") +else () + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz") +endif() + + +INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers. +if (WIN32) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME import library." FORCE) +elseif (APPLE) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +else () + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library."
FORCE) +endif () + +if (WIN32) + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} && + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +else () + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +endif() + +ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) +ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT}) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake new file mode 100644 index 0000000000000..ba6f0396008fc --- /dev/null +++ b/cmake/external/paddle2onnx.cmake @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_ONNXRUNTIME) + return() +endif() + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) +SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +SET(PADDLE2ONNX_TAG cpp) +SET(LIBDIR "lib") +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +if(WIN32) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) + SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) +elseif(APPLE) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +else() + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +endif(WIN32) + + +# The protoc path is required to compile onnx. 
+string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) +list(POP_BACK PROTOC_BIN_PATH) +list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) + + +set(PADDLE2ONNX_OPTIONAL_ARGS + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} + -DWITH_STATIC=OFF + -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} +) + +if (WITH_PYTHON) + set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY} + ) +endif () + + +ExternalProject_Add( + ${PADDLE2ONNX_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} + GIT_TAG ${PADDLE2ONNX_TAG} + DEPENDS protobuf + PREFIX ${PADDLE2ONNX_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB} +) + +ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index f7cb7716969f5..58ff5f0d2b715 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + + if(WITH_ONNXRUNTIME) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ONNXRUNTIME) + SET(PROTOBUF_VERSION 3.18.0) +elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45a76fdc1f1a2..cfbe68eecbaca 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c48d31f7e4f90..851bd81403a85 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST) endif() endif() + if (WITH_ONNXRUNTIME) + set(dst_dir 
"${DST}/third_party/install/onnxruntime") + copy(${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) + endif() + endif() + set(dst_dir "${DST}/third_party/install/gflags") copy(${TARGET} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ac3eff04d5383..7df095c6c2ec0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE) list(APPEND third_party_deps extern_gtest) ENDIF() +if(WITH_ONNXRUNTIME) + include(external/onnxruntime) # download, build, install onnxruntime态paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) +endif() + if(WITH_GPU) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f88c993d85e2f..49ba9479d49e9 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper) + if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) endif() -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 5dc43af117825..cb82677a281e9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank, "Only CPU place is supported for ProcessGroupGloo.")); } -ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, - int rank, int world_size, - const std::shared_ptr options) - : ProcessGroup(rank, world_size), _tag(0), _store(store) { +ProcessGroupGloo::ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 24f156571a427..71e0a40f8a761 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup { class GlooStore : public ::gloo::rendezvous::Store { public: - explicit GlooStore( - const std::shared_ptr& store) + explicit GlooStore(const std::shared_ptr& store) : _store(store) {} ~GlooStore() = default; @@ -87,7 +86,7 @@ 
class ProcessGroupGloo : public ProcessGroup { } protected: - std::shared_ptr _store; + std::shared_ptr _store; }; class GlooOptions { @@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, - int world_size, - std::shared_ptr options); + explicit ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, std::shared_ptr options); ~ProcessGroupGloo() = default; @@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup { protected: uint32_t _tag; std::shared_ptr _context; - std::shared_ptr _store; + std::shared_ptr<::gloo::rendezvous::Store> _store; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 84f5ca48d25c8..2deeb7ca03003 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -139,11 +139,9 @@ bool ProcessGroupHCCL::HCCLTask::IsCompleted() { // TODO(sandyhouse): Add timeout for wait, now timeout unused bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); - if (FLAGS_hccl_blocking_wait) { - // NOTE(sandyhouse): It will block host for sync - while (!IsCompleted()) { - std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); - } + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } return true; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f2376b4eed760..83d509be2cdd7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -84,29 +84,6 @@ class ProcessGroupHCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; - std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) override; - - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; - - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; - - std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; - - std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; - - std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; - protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 67715f410d443..7f21bcee87ab7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7d8..be4c5423943f5 100644 --- 
a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,11 +13,24 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { +static Backend TransToBackend(platform::Place place) { + static const std::map type_backend = { + {phi::AllocationType::GPU, Backend::GPU}, + {phi::AllocationType::CPU, Backend::CPU}, + }; + + phi::AllocationType type = place.GetType(); + auto it = type_backend.find(type); + PADDLE_ENFORCE_EQ(it != type_backend.end(), true, + platform::errors::InvalidArgument( + "Place type (%s) is not supported. ", place)); + return it->second; +} + std::vector> Eager_AssignGroupBySize( const std::vector tensors, const std::vector &is_sparse_gradient, @@ -127,5 +140,663 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor *p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + 
ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + + gradnode_index_map_[grad_node.get()] = global_var_index; + } + + vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); + + if (find_unused_vars_each_step_) { + global_used_vars_ = paddle::experimental::empty( + ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + TransToBackend(inner_place_)); + } +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + 
groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. + InitializeDenseGroups(tensor_indices_, &group); + // experimental::Backend backend = TransToBackend(inner_place_); + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, + TransToBackend(inner_place_)); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + platform::errors::PreconditionNotMet( + "Tensor %s has different place. 
Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { + std::queue queue; + std::set visited; + + for (const auto &output : outputs) { + auto *auto_grad_meta = + static_cast(output.get_autograd_meta()); + if (!auto_grad_meta) continue; + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + continue; + } + egr::GradNodeBase *grad_node = shared_grad_node.get(); + queue.emplace(grad_node); + } + + while (!queue.empty()) { + egr::GradNodeBase *node = queue.front(); + queue.pop(); + const std::vector> &edges = node->GetEdges(); + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const egr::Edge &edge = edges[i][j]; + auto next_node_shared = edge.GetMutableGradNode(); + if (!next_node_shared || !next_node_shared.get()) { + continue; + } + auto *next_node = next_node_shared.get(); + const bool was_inserted = visited.insert(next_node).second; + if (was_inserted) { + queue.emplace(next_node); + } + } + } + } + + for (const auto &it : gradnode_index_map_) { + if (visited.count(it.first) == 0) { + unused_vars_.push_back(it.second); + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Tensor " << tensors_[it.second].name() << " at index " + << it.second << " is marked as unused."; + } + } +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. 
Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == tensors_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false. + if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() + << "@Grad] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + if (!has_marked_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name() + << "] is marked ready."; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, tensors_[var_index].name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + groups_need_finalize_ = true; + + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + const auto length = group.length_[inside_group_index]; + + if (is_used_var) { + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + } else { + // TODO(shenliang03): maybe save the memory by avoiding tensor construction + if (!group_tensor.initialized()) { + group_tensor.Resize({static_cast(length)}); + group_tensor.mutable_data(inner_place_, group.dtype_); + } + if (HasGrad(var_index)) { + VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad"; + auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]); + group_tensor + .ShareDataWith(*( + std::dynamic_pointer_cast(grad_tensor->impl()))) + .Resize({length}); + } else { + VLOG(3) << "Tensor[" << tensors_[var_index].name() + << "] doesn't have grad"; + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); + group_tensor.Resize({static_cast(length)}); + phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0); + } + } + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } + + if (next_group_ == groups_.size()) { + FinalizeBackward(); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +bool EagerReducer::HasGrad(size_t var_index) { + auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); + if (grad && grad->is_initialized()) { + return true; + } else { + return false; + } +} + +void EagerReducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + + const auto *dev_ctx = + platform::DeviceContextPool::Instance().Get(inner_place_); + auto *global_used_tensor = + std::dynamic_pointer_cast(global_used_vars_.impl()) + .get(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + std::vector reduce_tensors = {global_used_vars_}; + process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + dev_ctx->Wait(); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Var [" << var_index << "] [" << tensors_[var_index].name() + << "] global_unused: " << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank " + << process_group_->GetRank() << "]"; + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + auto &src_tensor = group.dense_tensors_[inside_group_index]; + + Tensor grad_value(std::make_shared(src_tensor)); + + auto dest_var_base = tensors_[var_index]; + auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base); + grad_tensor->copy_(grad_value, inner_place_, true); + grad_tensor->reshape(dest_var_base.shape()); + } + } +} + +void EagerReducer::FinalizeBackward() { + groups_need_finalize_ = false; + grad_need_hooks_ = false; + for (auto &group : groups_) { + group.task->Synchronize(); + } + + for (auto &group : groups_) { + group.SplitTensors(inner_place_); + } + + if (find_unused_vars_each_step_) { + ProcessUnusedDenseVars(); + local_used_vars_.clear(); + local_used_vars_.resize(tensors_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0, + false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + group->task = process_group_->AllReduce(reduce_tensors, opts); + + // split in FinalizeBackward() +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle 
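Note on reducer.cc above: each group follows a fixed timeline — ConcatTensors flattens the dense gradients into one contiguous buffer, the buffer is scaled by 1.0/nranks_, a single fused allreduce is issued, and FinalizeBackward() splits the result back into the original gradients. The grouping itself comes from Eager_AssignGroupBySize. The standalone C++ sketch below is not part of this patch; it only illustrates the size-based bucketing idea under simplifying assumptions (every gradient dense, one global byte limit, a hypothetical AssignGroupBySize helper), whereas the real Eager_AssignGroupBySize also separates buckets by dtype, gives each sparse gradient its own group, and takes per-group limits via group_size_limits.

#include <cstdint>
#include <iostream>
#include <vector>

// Pack tensor indices into buckets until a bucket's accumulated byte size
// reaches group_limit; each bucket can then be flattened and all-reduced
// with a single collective call instead of one call per tensor.
std::vector<std::vector<size_t>> AssignGroupBySize(
    const std::vector<int64_t> &tensor_bytes, int64_t group_limit) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> cur;
  int64_t cur_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    cur.push_back(i);
    cur_bytes += tensor_bytes[i];
    if (cur_bytes >= group_limit) {  // bucket is full, seal it
      groups.push_back(std::move(cur));
      cur.clear();
      cur_bytes = 0;
    }
  }
  if (!cur.empty()) groups.push_back(std::move(cur));  // trailing partial bucket
  return groups;
}

int main() {
  // Gradient sizes in bytes: 4 MB, 1 MB, 2 MB, 8 MB; bucket limit 4 MB.
  const std::vector<int64_t> bytes = {4 << 20, 1 << 20, 2 << 20, 8 << 20};
  const auto groups = AssignGroupBySize(bytes, 4 << 20);
  for (size_t g = 0; g < groups.size(); ++g) {
    std::cout << "group " << g << ":";
    for (size_t tensor_idx : groups[g]) std::cout << ' ' << tensor_idx;
    std::cout << '\n';
  }
  return 0;
}

With these sample sizes the 4 MB gradient fills bucket 0 by itself and gradients 1-3 share bucket 1, so the backward pass would issue two allreduce calls instead of four.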
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8bd..d3ffa8498a14b 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,126 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/string/string_helper.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; +using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. 
+ size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // help to sync + std::shared_ptr task; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + void FinalizeBackward(); + void TraverseBackwardGraph(const std::vector &outputs); + void ProcessUnusedDenseVars(); + bool HasGrad(size_t var_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; + + // Following variables are to help unused vars + std::vector unused_vars_; + std::map gradnode_index_map_; + bool has_marked_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; + Tensor global_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3e734b1b9ed24..8641b36a1be8e 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 8d2ec5c41d864..80a6b4667aa1a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) server_.Stop(1000); server_.Join(); #endif @@ -87,8 +86,7 @@ bool 
MessageBus::Send(int64_t dst_rank, IsInit(), true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -173,8 +171,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, @@ -203,8 +200,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index d805ac81606b8..dfd65fdbc00d4 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,8 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -64,8 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -81,8 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index c3fff98f684ad..1c66d83ea34d7 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 02f73471e3b91..5ab687ff93dc4 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 18920d06f3854..ba039385a74ba 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -24,10 +24,14 @@ limitations under the License. */ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(fill_constant); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 0ae87812bce43..fac30e26c388c 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = true ]; + optional float cache_ratio = 6 [ default = 0.3 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index b8ccd8e744dab..f86b4b706b3e2 100644 --- 
a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -414,6 +414,16 @@ std::future BrpcPsClient::load(uint32_t table_id, return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Load(const LoadSaveContext &load_context) { + if (load_context.table_id < 0) { + return send_cmd(-1, PS_LOAD_ALL_TABLE, + {load_context.epoch, load_context.mode}); + } else { + return send_cmd(load_context.table_id, PS_LOAD_ONE_TABLE, + {load_context.epoch, load_context.mode}); + } +} + std::future BrpcPsClient::save(const std::string &epoch, const std::string &mode) { VLOG(1) << "BrpcPsClient::save path " << epoch; @@ -427,6 +437,19 @@ std::future BrpcPsClient::save(uint32_t table_id, return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Save(const LoadSaveContext &save_context) { + if (save_context.table_id < 0) { + VLOG(1) << "BrpcPsClient::save path " << save_context.epoch; + return send_save_cmd(-1, PS_SAVE_ALL_TABLE, + {save_context.epoch, save_context.mode}); + } else { + VLOG(1) << "BrpcPsClient::save one table path " << save_context.epoch + << " table_id " << save_context.table_id; + return send_save_cmd(save_context.table_id, PS_SAVE_ONE_TABLE, + {save_context.epoch, save_context.mode}); + } +} + std::future BrpcPsClient::clear() { return send_cmd(-1, PS_CLEAR_ALL_TABLE, {}); } @@ -505,6 +528,44 @@ std::future BrpcPsClient::barrier(size_t table_id, return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)}); } +std::future BrpcPsClient::Pull(RequestContext &pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region *dense_region = + reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t *keys = reinterpret_cast(pull_context.keys); + float **select_values = + reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + bool is_training = pull_context.is_training; + if (pull_context.training_mode == Geo) { // for geo + pull_sparse_param(select_values, table_id, keys, num, is_training); + } else if (pull_context.training_mode == Async) { // for async + pull_sparse(select_values, table_id, keys, num, is_training); + } + } +} + +std::future BrpcPsClient::Push(RequestContext &push_context) { + if (push_context.value_type == Dense) { // push dense + const Region *dense_region = push_context.push_context.push_dense_values; + push_dense(dense_region, push_context.num, push_context.table); + } else { // push sparse + size_t table_id = push_context.table; + size_t num = push_context.num; + bool is_training = push_context.is_training; + if (push_context.training_mode == Geo) { // for geo + // TODO(zhaocaibei) + } else if (push_context.training_mode == Async) { // for async + const uint64_t *keys = push_context.push_context.keys; + const float **update_values = push_context.push_context.push_values; + push_sparse(table_id, keys, update_values, num); + } + } +} + std::future BrpcPsClient::pull_geo_param(size_t table_id, std::vector *values, std::vector *keys, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index 59ed59933db86..8b0cb0741b400 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -163,12 +163,17 @@ class BrpcPsClient : public PSClient { 
std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + std::future Load(const LoadSaveContext &load_context) override; + std::future save(const std::string &epoch, const std::string &mode) override; std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + virtual std::future Save( + const LoadSaveContext &save_context) override; + std::future clear() override; std::future clear(uint32_t table_id) override; @@ -199,6 +204,10 @@ class BrpcPsClient : public PSClient { const uint64_t *keys, size_t num, bool is_training); + virtual std::future Pull(RequestContext &pull_context) override; + + virtual std::future Push(RequestContext &push_context) override; + virtual std::future print_table_stat(uint32_t table_id); virtual std::future barrier(size_t table_id, uint32_t barrier_type); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h index 4310c247438ce..d81a3a5df07f1 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -51,7 +51,7 @@ class BrpcPsServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); private: virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index 301708f6b7bb3..a3db88e3b679d 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -44,7 +44,7 @@ void GraphPsService_Stub::service( } } -int GraphBrpcClient::get_server_index_by_id(uint64_t id) { +int GraphBrpcClient::get_server_index_by_id(int64_t id) { int shard_num = get_shard_num(); int shard_per_server = shard_num % server_size == 0 ? 
shard_num / server_size @@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) { } std::future GraphBrpcClient::get_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { std::vector request2server; @@ -66,7 +66,7 @@ std::future GraphBrpcClient::get_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -129,7 +129,7 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) @@ -179,9 +179,9 @@ std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { return fut; } std::future GraphBrpcClient::add_graph_node( - uint32_t table_id, std::vector &node_id_list, + uint32_t table_id, std::vector &node_id_list, std::vector &is_weighted_list) { - std::vector> request_bucket; + std::vector> request_bucket; std::vector> is_weighted_bucket; bool add_weight = is_weighted_list.size() > 0; std::vector server_index_arr; @@ -191,7 +191,7 @@ std::future GraphBrpcClient::add_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); if (add_weight) is_weighted_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( @@ -229,7 +229,7 @@ std::future GraphBrpcClient::add_graph_node( size_t node_num = request_bucket[request_idx].size(); closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); if (add_weight) { bool weighted[is_weighted_bucket[request_idx].size() + 1]; for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) @@ -248,8 +248,8 @@ std::future GraphBrpcClient::add_graph_node( return fut; } std::future GraphBrpcClient::remove_graph_node( - uint32_t table_id, std::vector &node_id_list) { - std::vector> request_bucket; + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; std::vector server_index_arr; std::vector index_mapping(server_size, -1); for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { @@ -257,7 +257,7 @@ std::future GraphBrpcClient::remove_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( node_id_list[query_idx]); @@ -291,7 +291,7 @@ std::future GraphBrpcClient::remove_graph_node( closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); // PsService_Stub rpc_stub(get_cmd_channel(server_index)); GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); @@ -303,9 +303,9 @@ std::future 
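The id-to-server routing used on both client and server reduces to two integer divisions. A small self-contained sketch, assuming the truncated false branch of the ternary above rounds the block size up when shards do not divide evenly (consistent with the server-side id % shard_num / shard_num_per_server later in this patch):

// Sketch: the id -> server mapping used by get_server_index_by_id().
// Shards are assigned to servers in contiguous blocks; when shard_num does
// not divide evenly, each server takes the rounded-up block size.
int server_index_for(int64_t id, int shard_num, int server_size) {
  int shard_per_server = (shard_num + server_size - 1) / server_size;
  return static_cast<int>((id % shard_num) / shard_per_server);
}
// e.g. shard_num = 127, server_size = 2 -> 64 shards per server,
// so node id 100 lives in shard 100 and is served by server 1.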
GraphBrpcClient::remove_graph_node( } // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - // std::vector>> &res, - std::vector> &res, + uint32_t table_id, std::vector node_ids, int sample_size, + // std::vector>> &res, + std::vector> &res, std::vector> &res_weight, bool need_weight, int server_index) { if (server_index != -1) { @@ -337,7 +337,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[node_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[node_idx].emplace_back( @@ -358,7 +358,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(0)->set_table_id(table_id); closure->request(0)->set_client_id(_client_id); closure->request(0)->add_params((char *)node_ids.data(), - sizeof(uint64_t) * node_ids.size()); + sizeof(int64_t) * node_ids.size()); closure->request(0)->add_params((char *)&sample_size, sizeof(int)); closure->request(0)->add_params((char *)&need_weight, sizeof(bool)); ; @@ -380,14 +380,14 @@ std::future GraphBrpcClient::batch_sample_neighbors( server2request[server_index] = request2server.size(); request2server.push_back(server_index); } - // res.push_back(std::vector>()); + // res.push_back(std::vector>()); res.push_back({}); if (need_weight) { res_weight.push_back({}); } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -428,7 +428,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[query_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[query_idx].emplace_back( @@ -459,7 +459,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -476,7 +476,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( } std::future GraphBrpcClient::random_sample_nodes( uint32_t table_id, int server_index, int sample_size, - std::vector &ids) { + std::vector &ids) { DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; @@ -490,7 +490,7 @@ std::future GraphBrpcClient::random_sample_nodes( auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { - ids.push_back(*(uint64_t *)(buffer + index)); + ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; } delete[] buffer; @@ -633,7 +633,7 @@ std::future GraphBrpcClient::pull_graph_list( } std::future GraphBrpcClient::set_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &features) { std::vector request2server; @@ -646,7 +646,7 @@ std::future 
GraphBrpcClient::set_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); @@ -696,7 +696,7 @@ std::future GraphBrpcClient::set_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 06e753d028baa..e2b8a518615dc 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual ~GraphBrpcClient() {} // given a batch of nodes, sample graph_neighbors for each of them virtual std::future batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>& res, + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>& res, std::vector>& res_weight, bool need_weight, int server_index = -1); @@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future random_sample_nodes(uint32_t table_id, int server_index, int sample_size, - std::vector& ids); + std::vector& ids); virtual std::future get_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); virtual std::future set_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, const std::vector>& features); virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( - uint32_t table_id, std::vector& node_id_list, + uint32_t table_id, std::vector& node_id_list, std::vector& is_weighted_list); virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, @@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future load_graph_split_config(uint32_t table_id, std::string path); virtual std::future remove_graph_node( - uint32_t table_id, std::vector& node_id_list); + uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } - int get_server_index_by_id(uint64_t id); + int get_server_index_by_id(int64_t id); void set_local_channel(int index) { this->local_channel = get_cmd_channel(index); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb3097..20a55e4d11983 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table, return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t 
*)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 2) { size_t weight_list_size = request.params(1).size() / sizeof(bool); @@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, "graph_get_node_feat request requires at least 1 argument"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(node_ids); return 0; @@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( "graph_random_sample_neighbors request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); @@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( int32_t GraphBrpcService::graph_random_sample_nodes( Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - size_t size = *(uint64_t *)(request.params(0).c_str()); + size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == @@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, "graph_get_node_feat request requires at least 2 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); @@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( "at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_t node_num = request.params(0).size() / sizeof(int64_t), size_of_size_t = sizeof(size_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); - bool need_weight = *(uint64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); + bool need_weight = *(int64_t *)(request.params(2).c_str()); + // std::vector res = ((GraphTable // *)table).filter_out_non_exist_nodes(node_data, sample_size); std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = get_rank(); for (int query_idx = 0; query_idx < 
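Every RPC in these hunks moves node ids as a raw array of int64_t bytes in params(0): the client packs with add_params and the server re-materializes a std::vector, as above. A self-contained sketch of that wire convention (helper names are hypothetical):

#include <cstdint>
#include <string>
#include <vector>

// Pack node ids the way the client hunks do with add_params().
std::string pack_ids(const std::vector<int64_t> &ids) {
  return std::string(reinterpret_cast<const char *>(ids.data()),
                     sizeof(int64_t) * ids.size());
}

// Unpack the way the server hunks do from request.params(0).
std::vector<int64_t> unpack_ids(const std::string &param) {
  size_t node_num = param.size() / sizeof(int64_t);
  const int64_t *data = reinterpret_cast<const int64_t *>(param.c_str());
  return std::vector<int64_t>(data, data + node_num);
}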
node_num; ++query_idx) { @@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, "graph_set_node_feat request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index aee0190850753..a978d97b296b0 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -43,7 +43,7 @@ class GraphBrpcServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); std::condition_variable *export_cv() { return &cv_; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 21719fbdbf1d6..8a2bfbe31602b 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -26,6 +26,7 @@ #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -59,6 +60,41 @@ class PSClientClosure : public google::protobuf::Closure { std::vector>> _promises; }; +struct LoadSaveContext { + int table_id; + std::string epoch; + std::string mode; +}; + +enum TrainingMode { Async = 0, Sync = 1, Geo = 3 }; + +enum TrainingPhase { Init = 0, Train = 1, Save = 2 }; + +// enum ValueType { +// Sparse = 0, +// Dense = 1 +// }; + +struct PushContext { + const uint64_t *keys; + const float **push_values; + const Region *push_dense_values; +}; + +struct RequestContext { + int table; + TrainingMode training_mode; // 1 for async, 2 for geo, 3 for sync + TrainingPhase training_phase; // 1 for init, 2 for train + ValueType value_type; // 1 for sparse, 2 for dense + void *keys; + void **sparse_values; // for sparse values + Region *dense_values; // for dense values + PushContext push_context; + size_t num; + bool is_training; + void *callback; +}; + class PSClient { public: PSClient() {} @@ -86,6 +122,9 @@ class PSClient { // ęŒ‡å®štableę•°ę®load virtual std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + // context配ē½®load选锹 + virtual 
std::future Load(const LoadSaveContext &load_context) = 0; + // å…Ø量tableę•°ę®save value_accessorę ¹ę®modeļ¼ŒåÆčƒ½ęœ‰äøåŒēš„saveę”ä»¶ virtual std::future save(const std::string &epoch, const std::string &mode) = 0; @@ -93,6 +132,8 @@ class PSClient { virtual std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + virtual std::future Save(const LoadSaveContext &save_context) = 0; + // ęø…ē©ŗtableę•°ę® virtual std::future clear() = 0; virtual std::future clear(uint32_t table_id) = 0; @@ -107,6 +148,8 @@ class PSClient { virtual std::future pull_dense(Region *regions, size_t region_num, size_t table_id) = 0; // äæē•™ + virtual std::future Push(RequestContext &push_context) = 0; + // firstly push dense param for parameter server // this is neccessary because dense weight initialized in trainer on cold // start @@ -117,6 +160,9 @@ class PSClient { virtual std::future push_dense(const Region *regions, size_t region_num, size_t table_id) = 0; + + virtual std::future Pull(RequestContext &pull_context) = 0; + // ä½æē”Økeysčæ›č”ŒpullčÆ·ę±‚ļ¼Œē»“ęžœå”«å……values // keys和valuesēš„äøŖę•°å‡äøŗnumäøŖļ¼ŒęƏäøŖvalue占ē”Øselect_sizeē©ŗé—“ // futureē»“ęŸå‰keys和valuesē¼“冲åŒŗäøčƒ½å†ę¬”ä½æē”Ø diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index 972cce135f189..9e364b6d3ed7a 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -56,6 +56,19 @@ ::std::future PsLocalClient::load(uint32_t table_id, return done(); } +std::future PsLocalClient::Load(const LoadSaveContext& load_context) { + if (load_context.table_id < 0) { + for (auto& it : _table_map) { + load(it.first, load_context.epoch, load_context.mode); + } + return done(); + } else { + auto* table_ptr = table(load_context.table_id); + table_ptr->load(load_context.epoch, load_context.mode); + return done(); + } +} + ::std::future PsLocalClient::save(const std::string& epoch, const std::string& mode) { // TODO @@ -74,6 +87,21 @@ ::std::future PsLocalClient::save(uint32_t table_id, return done(); } +::std::future PsLocalClient::Save( + const LoadSaveContext& save_context) { + if (save_context.table_id < 0) { + for (auto& it : _table_map) { + save(it.first, save_context.epoch, save_context.mode); + } + return done(); + } else { + auto* table_ptr = table(save_context.table_id); + table_ptr->flush(); + table_ptr->save(save_context.epoch, save_context.mode); + return done(); + } +} + ::std::future PsLocalClient::clear() { // TODO return done(); @@ -93,6 +121,51 @@ ::std::future PsLocalClient::stop_server() { return done(); } +::std::future PsLocalClient::Pull(RequestContext& pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region* dense_region = reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t* keys = reinterpret_cast(pull_context.keys); + char** select_values = reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + pull_sparse_ptr(select_values, table_id, keys, num); + } +} + +::std::future PsLocalClient::Push(RequestContext& push_context) { + if (push_context.value_type == Dense) { // push dense + if (push_context.training_phase == Init) { + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense_param(regions, 
region_num, push_context.table); + } else { + if (push_context.training_mode == Geo) { // geo + float* total_send_data = + reinterpret_cast(push_context.dense_values); + size_t total_send_data_size = push_context.num; + push_dense_raw_gradient(push_context.table, total_send_data, + total_send_data_size, push_context.callback); + } else { // async and sync + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense(regions, region_num, push_context.table); + } + } + } else { // push sparse + if (push_context.training_mode == Async) { + const uint64_t* keys = push_context.push_context.keys; + const float** update_values = push_context.push_context.push_values; + size_t table_id = push_context.table; + size_t num = push_context.num; + push_sparse(table_id, keys, update_values, num); + } else { + // TODO + } + } +} + ::std::future PsLocalClient::pull_dense(Region* regions, size_t region_num, size_t table_id) { diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index e73974ac56286..83ca558e3d2cb 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -39,12 +39,16 @@ class PsLocalClient : public PSClient { virtual ::std::future load(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Load( + const LoadSaveContext& load_context) override; virtual ::std::future save(const std::string& epoch, const std::string& mode) override; virtual ::std::future save(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Save( + const LoadSaveContext& save_context) override; virtual ::std::future clear() override; virtual ::std::future clear(uint32_t table_id) override; @@ -55,6 +59,10 @@ class PsLocalClient : public PSClient { virtual ::std::future pull_dense(Region* regions, size_t region_num, size_t table_id); + virtual ::std::future Pull(RequestContext& pull_context) override; + + virtual ::std::future Push(RequestContext& push_context) override; + virtual ::std::future push_dense(const Region* regions, size_t region_num, size_t table_id); diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index 91f8bc4c91271..31b52126fc576 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -28,7 +28,6 @@ class PsLocalServer : public PSServer { virtual uint64_t start() { return 0; } virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } - virtual int32_t port() { return 0; } virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 088edcb75bbc6..c8be0f7971090 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } -void add_graph_node(std::vector node_ids, +void add_graph_node(std::vector node_ids, std::vector weight_list) {} -void remove_graph_node(std::vector node_ids) {} +void 
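For the sparse side, the caller fills the embedded PushContext instead of the dense region pointer. A sketch of an async sparse push through the unified Push() (keys, update_values, table_id and key_num are assumed to be prepared by the trainer; Sparse is assumed to be the other ValueType enumerator alongside Dense):

// Sketch: an async sparse push through the new Push() entry point.
RequestContext ctx;
ctx.value_type = Sparse;               // routes to push_sparse()
ctx.training_mode = Async;             // the Geo sparse branch is still a TODO
ctx.table = table_id;
ctx.num = key_num;
ctx.push_context.keys = keys;                  // const uint64_t*
ctx.push_context.push_values = update_values;  // const float**
local_client->Push(ctx);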
remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) { } void GraphPyClient::add_graph_node(std::string name, - std::vector& node_ids, + std::vector& node_ids, std::vector& weight_list) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name, } void GraphPyClient::remove_graph_node(std::string name, - std::vector& node_ids) { + std::vector& node_ids) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = get_ps_client()->remove_graph_node(table_id, node_ids); @@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) { } } -std::pair>, std::vector> +std::pair>, std::vector> GraphPyClient::batch_sample_neighbors(std::string name, - std::vector node_ids, + std::vector node_ids, int sample_size, bool return_weight, bool return_edges) { - // std::vector>> v; - std::vector> v; + std::vector> v; std::vector> v1; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name, // res.first[1]: slice index // res.first[2]: src nodes // res.second: edges weight - std::pair>, std::vector> res; + std::pair>, std::vector> res; res.first.push_back({}); res.first.push_back({}); if (return_edges) res.first.push_back({}); @@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name, status.wait(); } } -std::vector GraphPyClient::random_sample_nodes(std::string name, - int server_index, - int sample_size) { - std::vector v; +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = @@ -357,7 +356,7 @@ std::vector GraphPyClient::random_sample_nodes(std::string name, // (name, dtype, ndarray) std::vector> GraphPyClient::get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names) { std::vector> v( feature_names.size(), std::vector(node_ids.size())); @@ -371,7 +370,7 @@ std::vector> GraphPyClient::get_node_feat( } void GraphPyClient::set_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features) { if (this->table_id_map.count(node_type)) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index c25ef5035453d..85707137c1800 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -70,18 +70,34 @@ class GraphPyService { ::paddle::distributed::TableAccessorParameter* accessor_proto = sparse_table_proto->mutable_accessor(); - ::paddle::distributed::CommonAccessorParameter* common_proto = - sparse_table_proto->mutable_common(); + // ::paddle::distributed::CommonAccessorParameter* common_proto = + // sparse_table_proto->mutable_common(); + ::paddle::distributed::GraphParameter* graph_proto = + sparse_table_proto->mutable_graph_parameter(); + + ::paddle::distributed::GraphFeature* graph_feature = + 
graph_proto->mutable_graph_feature(); + + graph_proto->set_task_pool_size(24); + + graph_proto->set_table_name(table_name); + graph_proto->set_table_type(table_type); + graph_proto->set_use_cache(false); // Set GraphTable Parameter - common_proto->set_table_name(table_name); - common_proto->set_name(table_type); + // common_proto->set_table_name(table_name); + // common_proto->set_name(table_type); + // for (size_t i = 0; i < feat_name.size(); i++) { + // common_proto->add_params(feat_dtype[i]); + // common_proto->add_dims(feat_shape[i]); + // common_proto->add_attributes(feat_name[i]); + // } + for (size_t i = 0; i < feat_name.size(); i++) { - common_proto->add_params(feat_dtype[i]); - common_proto->add_dims(feat_shape[i]); - common_proto->add_attributes(feat_name[i]); + graph_feature->add_dtype(feat_dtype[i]); + graph_feature->add_shape(feat_shape[i]); + graph_feature->add_name(feat_name[i]); } - accessor_proto->set_accessor_class("CommMergeAccessor"); } @@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService { void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); void clear_nodes(std::string name); - void add_graph_node(std::string name, std::vector& node_ids, + void add_graph_node(std::string name, std::vector& node_ids, std::vector& weight_list); - void remove_graph_node(std::string name, std::vector& node_ids); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); - std::pair>, std::vector> - batch_sample_neighbors(std::string name, std::vector node_ids, + std::pair>, std::vector> + batch_sample_neighbors(std::string name, std::vector node_ids, int sample_size, bool return_weight, bool return_edges); - std::vector random_sample_nodes(std::string name, int server_index, - int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); std::vector> get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names); void use_neighbors_sample_cache(std::string name, size_t total_size_limit, size_t ttl); - void set_node_feat(std::string node_type, std::vector node_ids, + void set_node_feat(std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc index 5f1974e3e610c..893f671359e40 100644 --- a/paddle/fluid/distributed/ps/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -67,8 +67,6 @@ int32_t PSServer::configure( _config = config.server_param(); _rank = server_rank; _environment = &env; - _shuffled_ins = - paddle::framework::MakeChannel>(); size_t shard_num = env.get_ps_servers().size(); const auto &downpour_param = _config.downpour_server_param(); diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 160d4a6128295..d2804405b4198 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -69,11 +69,6 @@ class PSServer { const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}); - // return server_ip - virtual std::string ip() { return 
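After the migration from uint64_t to int64_t ids, the Python-facing client is exercised as below; the pair layout follows the res.first / res.second comment in the sampling hunk above. A hedged usage sketch (client construction and the table name are assumed):

// Sketch: sampling neighbors through GraphPyClient after the int64_t change.
std::vector<int64_t> node_ids = {37, 96};
auto res = client.batch_sample_neighbors(
    "graph_table",            // hypothetical table name registered in set_up()
    node_ids,
    /*sample_size=*/10,
    /*return_weight=*/true,
    /*return_edges=*/false);
auto &neighbor_ids = res.first;  // layout per the res.first[...] comment above
auto &weights = res.second;      // edge weights, present when return_weight is set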
butil::my_ip_cstr(); } - // return server_port - virtual int32_t port() = 0; - virtual uint64_t start(const std::string &ip, uint32_t port) = 0; virtual int32_t stop() = 0; @@ -94,15 +89,6 @@ class PSServer { return &_table_map; } - typedef std::function MsgHandlerFunc; - virtual int registe_pserver2pserver_msg_handler(int msg_type, - MsgHandlerFunc handler) { - _msg_handler_map[msg_type] = handler; - return 0; - } - - paddle::framework::Channel> _shuffled_ins; - protected: virtual int32_t initialize() = 0; @@ -111,7 +97,6 @@ class PSServer { ServerParameter _config; PSEnvironment *_environment; std::unordered_map> _table_map; - std::unordered_map _msg_handler_map; protected: std::shared_ptr scope_; diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index be916bf2e8003..2fa5ecb4051c5 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) - cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7c91a60864980..07c211bb9c128 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -45,6 +45,17 @@ struct DataConverter { std::string deconverter; }; +struct AccessorInfo { + size_t dim; + size_t size; + size_t select_size; + size_t select_dim; + size_t update_size; + size_t update_dim; + size_t mf_size; + size_t fea_dim; +}; + class ValueAccessor { public: ValueAccessor() {} @@ -68,6 +79,8 @@ class ValueAccessor { } virtual int initialize() = 0; + virtual void GetTableInfo(AccessorInfo& info) = 0; + // valueē»“åŗ¦ virtual size_t dim() = 0; // value各äøŖē»“åŗ¦ēš„size @@ -163,6 +176,7 @@ class ValueAccessor { TableAccessorParameter _config; std::unordered_map> _data_coverter_map; + AccessorInfo _accessor_info; }; REGISTER_PSCORE_REGISTERER(ValueAccessor); } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc index 607469e2f7b0d..cc0f5867a3d65 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -128,6 +128,21 @@ int32_t CommonDenseTable::set_global_lr(float* lr) { return 0; } +int32_t CommonDenseTable::Pull(TableContext& context) { + CHECK(context.value_type == Dense); + float* pull_values = context.pull_context.values; + return pull_dense(pull_values, context.num); +} + +int32_t CommonDenseTable::Push(TableContext& context) { + CHECK(context.value_type == Dense); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + return push_dense(values, context.num); + } + return 0; +} + int32_t CommonDenseTable::pull_dense(float* pull_values, size_t num) { std::copy(values_[param_idx_].begin(), values_[param_idx_].end(), pull_values); diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.h 
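AccessorInfo gives each accessor a single struct through which to report its value layout. A sketch of what a GetTableInfo override might look like, assuming the usual select/update/fea query methods on the concrete accessor (only dim() and size() are visible in this hunk; the rest are assumptions):

// Sketch (assumptions): a GetTableInfo() override filling the new struct.
void GetTableInfo(AccessorInfo &info) override {
  info.dim = dim();
  info.size = size();
  info.select_dim = select_dim();
  info.select_size = select_size();
  info.update_dim = update_dim();
  info.update_size = update_size();
  info.fea_dim = fea_dim();
}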
b/paddle/fluid/distributed/ps/table/common_dense_table.h index a4c0f29ddb877..cad49a0a449c4 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -40,6 +40,8 @@ class CommonDenseTable : public DenseTable { const std::string& name); virtual int32_t initialize_value(); virtual int32_t initialize_optimizer(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); int32_t pull_dense(float* pull_values, size_t num) override; int32_t push_dense_param(const float* values, size_t num) override; int32_t push_dense(const float* values, size_t num) override; diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 54b98cb96ce51..2c07bd65d63d4 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -27,6 +27,288 @@ namespace paddle { namespace distributed { +#ifdef PADDLE_WITH_HETERPS + +int CompleteGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + std::cout << "in graph sampling" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i % this->graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + if (this->status == 
GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + return 0; +} +void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; +} + +int BasicBfsGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + while (rounds > 0 && status == GraphSamplerStatus::running) { + for (size_t i = 0; i < sample_neighbors_map.size(); i++) { + sample_neighbors_map[i].clear(); + } + sample_neighbors_map.clear(); + std::vector nodes_left(graph_table->shards.size(), + node_num_for_each_shard); + std::promise prom; + std::future fut = prom.get_future(); + sample_neighbors_map.resize(graph_table->task_pool_size_); + int task_size = 0; + std::vector> tasks; + int init_size = 0; + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { + VLOG(0) << "in bfs " << i << " " << id; + if (this->status == GraphSamplerStatus::terminating) { + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + } + size_t ind = i % this->graph_table->task_pool_size_; + if (nodes_left[i] > 0) { + nodes_left[i]--; + auto iter = sample_neighbors_map[ind].find(id); + if (iter == sample_neighbors_map[ind].end()) { + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); + Node *node = graph_table->shards[i]->find_node(id); + if (node != NULL) { + size_t edge_fetch_size = + std::min((size_t) this->edge_num_for_each_node, + node->get_neighbor_size()); + for (size_t k = 0; k < edge_fetch_size; k++) { + int64_t neighbor_id = node->get_neighbor_id(k); + int node_location = neighbor_id % this->graph_table->shard_num % + this->graph_table->task_pool_size_; + __sync_add_and_fetch(&task_size, 1); + graph_table->_shards_task_pool[node_location]->enqueue( + bfs, neighbor_id % this->graph_table->shard_num, neighbor_id); + iter->second.push_back(neighbor_id); + } + } + } + } + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + }; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + std::vector &v = graph_table->shards[i]->get_bucket(); + if (v.size() > 0) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[0]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } // if + } + if (init_size == 0) { + prom.set_value(0); + } + fut.get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + std::cout << "bfs over" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + 
sample_neighbors_ex[i].resize(gpu_num); + } + tasks.clear(); + for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + paddle::framework::GpuPsGraphNode node; + auto iter = sample_neighbors_map[i].begin(); + size_t ind = i; + for (; iter != sample_neighbors_map[i].end(); iter++) { + size_t location = iter->first % this->gpu_num; + node.node_id = iter->first; + node.neighbor_size = iter->second.size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (auto k : iter->second) + sample_neighbors_ex[ind][location].push_back(k); + } + return 0; + })); + } + + for (size_t i = 0; i < tasks.size(); i++) { + tasks[i].get(); + sample_neighbors_map[i].clear(); + } + tasks.clear(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + int total_offset = 0; + size_t ind = i % graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[i].back().neighbor_offset += total_offset; + // neighbor_offset[i].push_back(total_offset + + // neighbor_offset_ex[j][i][k]); + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + // int64_t total_neighbors = + // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + rounds--; + if (rounds > 0) { + for (int i = 0; + i < interval && this->status == GraphSamplerStatus::running; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + } + return 0; +} +void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; + node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; + edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; + rounds = args.size() > 2 ? std::stoi(args[2]) : 1; + interval = args.size() > 3 ? 
std::stoi(args[3]) : 60; +} + +#endif + std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -38,10 +320,10 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_graph_node(std::vector &id_list, +int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(std::vector &id_list) { +int32_t GraphTable::remove_graph_node(std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -98,7 +380,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(uint64_t id) { +void GraphShard::delete_node(int64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(uint64_t id) { +GraphNode *GraphShard::add_graph_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(uint64_t id) { +FeatureNode *GraphShard::add_feature_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); @@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) { return (FeatureNode *)bucket[node_location[id]]; } -void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { +void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(uint64_t id) { +Node *GraphShard::find_node(int64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? 
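BasicBfsGraphSampler::init above reads its knobs positionally from the comma-separated gpups_graph_sample_args string. A sketch of a matching configuration (graph_proto as in the earlier GraphParameter sketch):

// Sketch: configuring the BFS sampler through the proto fields above.
// The comma-separated slots map, in order, onto node_num_for_each_shard,
// edge_num_for_each_node, rounds and interval (defaults 10, 10, 1, 60s).
graph_proto->set_gpups_graph_sample_class("BasicBfsGraphSampler");
graph_proto->set_gpups_graph_sample_args("100,10000,1,1");
// GraphTable::initialize() below splits this string on ',' and passes the
// slices to the sampler's init().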
nullptr : bucket[iter->second]; } @@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string ¶m) { } int32_t GraphTable::get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res) { + std::vector> ranges, std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; - while (start < end && index < ranges.size()) { + while (start < end && index < (int)ranges.size()) { if (ranges[index].second <= start) index++; else if (ranges[index].first >= end) { @@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [this, first, second, i]() -> std::vector { + [this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); +#endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { /*----------------------- relocate the duplicate nodes to make them distributed evenly among threads. */ + if (!use_duplicate_nodes) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif + + return 0; + } for (auto &shard : extra_shards) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { int size = extra_nodes_to_thread_index.size(); if (size == 0) return 0; std::vector index; - for (int i = 0; i < used.size(); i++) index.push_back(i); + for (int i = 0; i < (int)used.size(); i++) index.push_back(i); sort(index.begin(), index.end(), [&](int &a, int &b) { return used[a] < used[b]; }); std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); int t = 1, aim = 0, mod = 0; - for (; t < used.size(); t++) { + for (; t < (int)used.size(); t++) { if ((used[index[t]] - used[index[t - 1]]) * t >= size) { break; } else { @@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (t - x <= mod) alloc[index[x]]++; alloc[index[x]] -= used[index[x]]; } - std::vector vec[index.size()]; + std::vector vec[index.size()]; for (auto p : extra_nodes_to_thread_index) { has_alloc[p.second]++; vec[p.second].push_back(p.first); @@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { has_alloc[index[right]] - alloc[index[right]]); has_alloc[index[left]] += x; has_alloc[index[right]] -= x; - uint64_t id; + int64_t id; while (x--) { id = vec[index[right]].back(); vec[index[right]].pop_back(); @@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif return 0; } -Node *GraphTable::find_node(uint64_t id) { +Node *GraphTable::find_node(int64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end 
|| shard_id < shard_start) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) @@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) { Node *node = shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) return node_id % shard_num % shard_num_per_server % task_pool_size_; size_t src_shard_id = node_id % shard_num; @@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return src_shard_id % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index( - uint64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } @@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { int total_size = 0; - for (int i = 0; i < shards.size(); i++) { + for (int i = 0; i < (int)shards.size(); i++) { total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; @@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(second_half, res); - actual_size = res.size() * sizeof(uint64_t); + actual_size = res.size() * sizeof(int64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors( seq_id[index].emplace_back(idx); id_list[index].emplace_back(node_ids[idx], sample_size, need_weight); } - for (int i = 0; i < seq_id.size(); i++) { + for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - uint64_t node_id; + int64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors( std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; for (size_t k = 0; k < id_list[i].size(); k++) { - if (index < r.size() && + if (index < (int)r.size() && r[index].first.node_key == id_list[i][k].node_key) { idx = seq_id[i][k]; actual_sizes[idx] = r[index].second.actual_size; @@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - uint64_t id; + int64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(const std::vector &node_ids, +int32_t GraphTable::get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { Node *node = find_node(node_id); @@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, if (node == nullptr) { return 0; } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { // res[feat_idx][idx] = @@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); @@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, return 0; } -int32_t GraphTable::get_server_index_by_id(uint64_t id) { +int32_t GraphTable::get_server_index_by_id(int64_t id) { return id % shard_num / shard_num_per_server; } +int32_t GraphTable::initialize(const TableParameter &config, + const FsClientParameter &fs_config) { + LOG(INFO) << "in graphTable initialize"; + _config = config; + if (initialize_accessor() != 0) { + LOG(WARNING) << "Table accessor initialize failed"; + return -1; + } -int32_t GraphTable::initialize() { + if (_afs_client.initialize(fs_config) != 0) { + LOG(WARNING) << "Table fs_client initialize failed"; + // return -1; + } + auto graph = config.graph_parameter(); + shard_num = _config.shard_num(); + LOG(INFO) << "in graphTable initialize over"; + return initialize(graph); +} +int32_t GraphTable::initialize(const GraphParameter &graph) { +#ifdef PADDLE_WITH_HETERPS + if (graph.gpups_mode()) { + gpups_mode = true; + if (shard_num == 0) { + shard_num = graph.gpups_mode_shard_num(); + server_num = 1; + _shard_idx = 0; + } + auto *sampler = + CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); + auto slices = + string::split_string(graph.gpups_graph_sample_args(), ","); + std::cout << "slices" << std::endl; + for (auto x : slices) std::cout << x << std::endl; + 
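Each per-node buffer produced by random_sample_neighbors packs ids back-to-back, each optionally followed by a float weight, which is exactly what the actual_size formula above accounts for. A self-contained sketch of the matching decoder (id_size/weight_size mirror Node::id_size and Node::weight_size):

#include <cstdint>
#include <cstring>
#include <vector>

// Decode one neighbor buffer written by random_sample_neighbors() above.
void decode_neighbors(const char *buf, int actual_size, bool need_weight,
                      std::vector<int64_t> *ids, std::vector<float> *weights,
                      size_t id_size = sizeof(int64_t),
                      size_t weight_size = sizeof(float)) {
  int offset = 0;
  while (offset < actual_size) {
    int64_t id;
    std::memcpy(&id, buf + offset, id_size);
    ids->push_back(id);
    offset += id_size;
    if (need_weight) {
      float w;
      std::memcpy(&w, buf + offset, weight_size);
      weights->push_back(w);
      offset += weight_size;
    }
  }
}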
sampler->init(graph.gpu_num(), this, slices); + graph_sampler.reset(sampler); + } +#endif + task_pool_size_ = graph.task_pool_size(); _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } - server_num = _shard_num; - // VLOG(0) << "in init graph table server num = " << server_num; - /* - _shard_num is actually server number here - when a server initialize its tables, it sets tables' _shard_num to server_num, - and _shard_idx to server - rank - */ - auto common = _config.common(); - - this->table_name = common.table_name(); - this->table_type = common.name(); + auto graph_feature = graph.graph_feature(); + // this->table_name = common.table_name(); + // this->table_type = common.name(); + this->table_name = graph.table_name(); + this->table_type = graph.table_type(); VLOG(0) << " init graph table type " << this->table_type << " table name " << this->table_name; - int feat_conf_size = static_cast<int>(common.attributes().size()); + // int feat_conf_size = static_cast<int>(common.attributes().size()); + int feat_conf_size = static_cast<int>(graph_feature.name().size()); for (int i = 0; i < feat_conf_size; i++) { - auto &f_name = common.attributes()[i]; - auto &f_shape = common.dims()[i]; - auto &f_dtype = common.params()[i]; + // auto &f_name = common.attributes()[i]; + // auto &f_shape = common.dims()[i]; + // auto &f_dtype = common.params()[i]; + auto &f_name = graph_feature.name()[i]; + auto &f_shape = graph_feature.shape()[i]; + auto &f_dtype = graph_feature.dtype()[i]; this->feat_name.push_back(f_name); this->feat_shape.push_back(f_shape); this->feat_dtype.push_back(f_dtype); @@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() { VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } - - shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; shard_num_per_server = sparse_local_shard_num(shard_num, server_num); @@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() { return 0; } + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index c76a62248c8fc..f6f127621b947 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,10 +38,14 @@ #include <vector> #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/class_macro.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#endif namespace paddle { namespace distributed { class GraphShard { @@ -51,37 +55,37 @@ class GraphShard { ~GraphShard(); std::vector<Node *> &get_bucket() { return bucket; } std::vector<Node *> get_batch(int start, int end, int step); - std::vector<uint64_t> get_ids_by_range(int start, int end) { - std::vector<uint64_t> res; + std::vector<int64_t> get_ids_by_range(int start, int end) { + std::vector<int64_t> res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); 
GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map<uint64_t, int> &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map<int64_t, int> &get_node_location() { return node_location; } private: - std::unordered_map<uint64_t, int> node_location; + std::unordered_map<int64_t, int> node_location; std::vector<Node *> bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "<<global_count<<"\n"; - if (global_count > size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU<K, V>; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector<paddle::framework::GpuPsCommGraph> &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise<int> prom; + std::future<int> fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args) = 0; + virtual void set_graph_sample_callback( + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future<int> graph_sample_task_over; + std::vector<paddle::framework::GpuPsCommGraph> sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr<char[]> &buffer, @@ -362,7 +426,7 @@ class 
GraphTable : public SparseTable { int step); virtual int32_t random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector<std::shared_ptr<char>> &buffers, std::vector<int> &actual_sizes, bool need_weight); @@ -370,9 +434,11 @@ class GraphTable : public SparseTable { int &actual_sizes); virtual int32_t get_nodes_ids_by_ranges( - std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res); - virtual int32_t initialize(); - + std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res); + virtual int32_t initialize() { return 0; } + virtual int32_t initialize(const TableParameter &config, + const FsClientParameter &fs_config); + virtual int32_t initialize(const GraphParameter &config); int32_t load(const std::string &path, const std::string &param); int32_t load_graph_split_config(const std::string &path); @@ -380,13 +446,16 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(std::vector<uint64_t> &id_list, + int32_t add_graph_node(std::vector<int64_t> &id_list, std::vector<bool> &is_weight_list); - int32_t remove_graph_node(std::vector<uint64_t> &id_list); + int32_t remove_graph_node(std::vector<int64_t> &id_list); + + int32_t get_server_index_by_id(int64_t id); + Node *find_node(int64_t id); - int32_t get_server_index_by_id(uint64_t id); - Node *find_node(uint64_t id); + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { @@ -407,16 +476,27 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); - virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int32_t set_shard(size_t shard_idx, size_t server_num) { + _shard_idx = shard_idx; + /* + _shard_num is not used in graph_table; the assignment below is kept only + for compatibility with the base class table. 
+ */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair<int32_t, std::string> parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector<uint64_t> &node_ids, + virtual int32_t get_node_feat(const std::vector<int64_t> &node_ids, const std::vector<std::string> &feature_names, std::vector<std::vector<std::string>> &res); virtual int32_t set_node_feat( - const std::vector<uint64_t> &node_ids, + const std::vector<int64_t> &node_ids, const std::vector<std::string> &feature_names, const std::vector<std::vector<std::string>> &res); @@ -433,11 +513,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector<GraphShard *> shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector<std::string> feat_name; @@ -450,11 +544,61 @@ class GraphTable : public SparseTable { std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool; std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool; std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru; - std::unordered_set<uint64_t> extra_nodes; - std::unordered_map<uint64_t, int> extra_nodes_to_thread_index; + std::unordered_set<int64_t> extra_nodes; + std::unordered_map<int64_t, int> extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr<pthread_rwlock_t> rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr<GraphSampler> graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args_); + + protected: + GraphTable *graph_table; + std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<int64_t>> sample_neighbors; + // std::vector<GpuPsCommGraph> sample_res; + // std::shared_ptr<std::mt19937_64> random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args_); + + protected: + GraphTable *graph_table; + // std::vector<std::vector<GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<int64_t>> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector<std::unordered_map<int64_t, std::vector<int64_t>>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc index b44d08b937a96..45be53335e1a1 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ 
-355,6 +355,32 @@ int32_t CommonSparseTable::pour() { return 0; } +int32_t CommonSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t CommonSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.push_context.values != nullptr) { + const float* values = context.push_context.values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } else { + const float** values = context.push_context.ptr_values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } +} + int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h index 82481dcd584e4..138c544742066 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -121,6 +121,9 @@ class CommonSparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index bac826dfe0e20..3d291c0152246 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -119,6 +119,9 @@ class BarrierTable : public Table { virtual void *get_shard(size_t shard_idx) { return 0; } + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 866bd8114ccea..43e143dca901b 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -38,6 +38,16 @@ int CtrCommonAccessor::initialize() { return 0; } +void CtrCommonAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } size_t CtrCommonAccessor::dim_size(size_t dim) { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 1e31fec04649b..bc46217955a8a 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -126,6 +126,7 @@ class CtrCommonAccessor : public ValueAccessor { virtual int initialize(); 
virtual ~CtrCommonAccessor() {} + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index b07bcf70ad7af..bccf1fdebafa0 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrDoubleAccessor::initialize() { return 0; } +void DownpourCtrDoubleAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrDoubleAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrDoubleFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index d7c717ace0988..d7942634e8600 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -168,6 +168,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { DownpourCtrDoubleAccessor() {} virtual ~DownpourCtrDoubleAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h index 708f7786bf3b0..98e0250acc4d6 100644 --- a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h @@ -58,7 +58,7 @@ struct PullSparseValue { std::vector<int>* offset_shard) const { offset_shard->reserve(numel_ / shard_num + 1); for (int x = 0; x < numel_; ++x) { - if (feasigns_[x] % shard_num == shard_id) { + if (int(feasigns_[x] % shard_num) == shard_id) { offset_shard->push_back(x); } } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc index 5f22c3a436f1f..e8ca7430351de 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrAccessor::initialize() { return 0; } +void DownpourCtrAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrFeatureValue::dim(embedx_dim); } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h index 5de7b12e01f0d..11991ad044ff6 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h @@ -160,6 +160,7 @@ class DownpourCtrAccessor : public ValueAccessor { virtual ~DownpourCtrAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new 
file mode 100644 index 0000000000000..bf59dbacb2537 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__)
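A note on the new macro header above: REGISTER_GRAPH_FRIEND_CLASS(n, ...) pastes the argument count n into the matching DECLARE_n_FRIEND_CLASS, which unrolls recursively into one friend declaration per argument. As a minimal sketch, the invocation added to common_graph_table.h earlier in this diff,

  REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)

preprocesses to

  friend class CompleteGraphSampler; friend class BasicBfsGraphSampler;

inside the enclosing class body, giving the samplers access to GraphTable internals without widening its public interface.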
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d882..004a536e8e56c 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357a7..5fc785fe25682 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector<int64_t>& export_id_array() { return id_arr; } protected: - std::vector<uint64_t> id_arr; + std::vector<int64_t> id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258d8..c6c594036d4fc 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 89c4fc15ae279..3b43f99543fdd 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -48,6 +48,8 @@ class MemorySparseGeoTable : public SparseTable { virtual int32_t save(const std::string& path, const std::string& param) { return 0; } + virtual int32_t Pull(TableContext& context) { return 0; } + virtual int32_t Push(TableContext& context) { return 0; } virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string& param) { return 0; } virtual void clear() { return; } diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 7ce6e9005cf56..98454ca747d31 100644 --- 
a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -390,6 +390,26 @@ std::pair<int64_t, int64_t> MemorySparseTable::print_table_stat() { return {feasign_size, mf_size}; } +int32_t MemorySparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t MemorySparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, context.push_context.ptr_values, context.num); +} + int32_t MemorySparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { CostTimer timer("pserver_sparse_select_all"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 5770f25f8f41d..d26c67319760d 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -48,6 +48,9 @@ class MemorySparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 60514b4e19ffa..5bc58bc5a1108 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -61,6 +61,21 @@ int32_t SSDSparseTable::initialize() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { return 0; } + int32_t SSDSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index f5e8a7067e0e0..3a703d7d966d3 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -42,6 +42,9 @@ class SSDSparseTable : public CommonSparseTable { // exchange data virtual int32_t update_table(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07ab7..fc2ea56e95d77 
100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index da1bb668ccfa3..2bd2a42b6c58f 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -32,6 +32,30 @@ namespace paddle { namespace distributed { + +enum ValueType { Sparse = 0, Dense = 1 }; + +struct PullContext { + const uint64_t *keys; + const PullSparseValue pull_value; + float *values; + char **ptr_values; +}; + +struct TablePushContext { + const uint64_t *keys; + const float *values; + const float **ptr_values; +}; + +struct TableContext { + ValueType value_type; + PullContext pull_context; + TablePushContext push_context; + size_t num; + bool use_ptr; +}; + class Table { public: Table() {} @@ -39,6 +63,8 @@ class Table { virtual int32_t initialize(const TableParameter &config, const FsClientParameter &fs_config); + virtual int32_t Pull(TableContext &context) = 0; + virtual int32_t Push(TableContext &context) = 0; virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; // for push global_step diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 70a580c1e53a9..8c5349bff832c 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -20,6 +20,16 @@ namespace distributed { int CommMergeAccessor::initialize() { return 0; } +void CommMergeAccessor::GetTableInfo(AccessorInfo &info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + // value dimension size_t CommMergeAccessor::dim() { return 0; } diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 5041b8fdf8733..1873b743b44ec 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -30,6 +30,7 @@ class CommMergeAccessor : public ValueAccessor { CommMergeAccessor() {} virtual ~CommMergeAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo &info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 64d81327acc55..23a62365c0f5a 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -48,6 +48,8 @@ class TensorTable : public Table { TensorTable() {} virtual ~TensorTable() {} + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; }
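For context on the REGISTER_PSCORE_CLASS lines in table.cc above: registration plus CREATE_PSCORE_CLASS form a string-keyed factory, which is how GraphTable::initialize(const GraphParameter &) earlier in this diff turns the gpups_graph_sample_class config string into a sampler instance. A minimal sketch of the pattern; the error handling here is illustrative, not part of this patch:

  // at namespace scope, usually next to the other registrations in table.cc:
  REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler);
  // at runtime, instantiate whichever subclass the proto config names:
  GraphSampler *sampler =
      CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
  if (sampler == nullptr) {
    // the config named a sampler class that was never registered (assumed
    // failure mode; the patch itself does not check this)
  }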
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 0588dbdf0fc61..c887cfeb71eef 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -30,6 +30,32 @@ bool FleetWrapper::is_initialized_ = false; std::shared_ptr<paddle::distributed::PSCore> FleetWrapper::pserver_ptr_ = NULL; +void FleetWrapper::Stop() { StopServer(); } + +void FleetWrapper::Load(WrapperContext& context) { + int table_id = static_cast<int>(context.table_id); + if (table_id >= 0 && context.meta != "") { + LoadSparseOnServer(context.path, context.meta, context.table_id); + return; + } + if (table_id < 0) { // load all + LoadModel(context.path, context.mode); + } else { // load one table + LoadModelOneTable(table_id, context.path, context.mode); + } + return; +} + +void FleetWrapper::Save(WrapperContext& context) { + int table_id = static_cast<int>(context.table_id); + if (table_id < 0) { + SaveModel(context.path, context.mode); + } else { + SaveModelOneTable(table_id, context.path, context.mode); + } + return; +} + void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry) { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index a535b8c5bf8f9..d68c453c6d51b 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/wrapper/ps_wrapper.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/io/shell.h" @@ -54,7 +55,7 @@ using framework::Variable; using RpcCtxMap = std::unordered_map<std::string, CommContext>; -class FleetWrapper { +class FleetWrapper : public PSWrapper { public: virtual ~FleetWrapper() {} FleetWrapper() { @@ -68,7 +69,13 @@ class FleetWrapper { // pserver request max retry client2client_max_retry_ = 3; } + virtual int32_t Initialize(InitContext& context) { return 0; } + virtual void Stop() override; + + virtual void Load(WrapperContext& context) override; + + virtual void Save(WrapperContext& context) override; // set client to client communication config void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry); diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h index c92835aa995ad..ca02ad31195ef 100755 --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -1,18 +1,84 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ -#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ - -#endif // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <atomic> +#include <ctime> +#include <map> +#include <memory> +#include <random> +#include <string> +#include <unordered_map> +#include <vector> + +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { +class Scope; +class SelectedRows; +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace distributed { + +class PSCore; + +using framework::LoDTensor; +using framework::Scope; +using phi::SelectedRows; +using framework::Variable; + +using RpcCtxMap = std::unordered_map<std::string, CommContext>; + +struct WrapperContext { + uint32_t table_id; + const std::string path; + const int mode; + const std::string meta; +}; + +struct InitContext { + const std::vector<int> dev_ids; // for gpu +}; + +class PSWrapper { + public: + virtual ~PSWrapper() {} + PSWrapper() {} + // init server + + virtual int32_t Initialize(InitContext& context) = 0; + + virtual void Stop() = 0; + + virtual void Load(WrapperContext& context) = 0; + + virtual void Save(WrapperContext& context) = 0; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index eb98c89c99e47..b0d5add49565f 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -136,10 +136,6 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - VLOG(0) << "fds.size:" << fds.size(); - VLOG(0) << "fds.size-i:" << i; - VLOG(0) << "fds[i].revents:" << fds[i].revents; - try { if (fds[i].revents == 0) { continue; }
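A quick usage sketch for the rewritten ps_wrapper.h above: PSWrapper is the minimal serving-side interface, and FleetWrapper (earlier in this diff) is its first implementation. Assuming a wrapper whose server side has already been initialized and started elsewhere, a caller would drive it roughly as follows; the path, mode, and table id are illustrative values, not part of this patch:

  std::shared_ptr<paddle::distributed::PSWrapper> ps =
      std::make_shared<paddle::distributed::FleetWrapper>();
  paddle::distributed::WrapperContext load_ctx{/*table_id=*/0,
                                               /*path=*/"/models/ckpt",
                                               /*mode=*/0, /*meta=*/""};
  ps->Load(load_ctx);  // empty meta, non-negative id -> LoadModelOneTable
  ps->Stop();          // forwards to FleetWrapper::StopServer()

Note that WrapperContext has const members, so it must be brace-initialized up front rather than filled in field by field.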
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc442..cb46c38d4de4b 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e933b..a2f495de3c953 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions; dense_regions.insert( - std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {})); + std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector<std::vector<uint64_t>> _vs; + std::vector<std::vector<int64_t>> _vs; std::vector<std::vector<float>> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 97), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e60992e..565d51379d5a8 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<uint64_t> ids; + std::vector<int64_t> ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {37, 59}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<std::vector<uint64_t>> vs; + std::vector<std::vector<int64_t>> vs; std::vector<std::vector<float>> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 4, vs, vs1, true); + 0, std::vector<int64_t>(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {112, 45, 145}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 96), 4, vs, vs1, true); + 0, std::vector<int64_t>(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set<uint64_t> id_set; + int64_t id; + std::unordered_set<int64_t> id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector<uint64_t> id_list(id_set.begin(), id_set.end()); + std::vector<int64_t> id_list(id_set.begin(), id_set.end()); std::vector<bool> weight_list; auto status = worker_ptr_->add_graph_node(0, 
id_list, weight_list); status.wait(); - std::vector<uint64_t> ids[2]; + std::vector<int64_t> ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set<uint64_t> id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set<int64_t> id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector<uint64_t> remove_ids; + std::vector<int64_t> remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set<uint64_t> id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set<int64_t> id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ } void testBatchSampleNeighboor( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<std::vector<uint64_t>> vs; + std::vector<std::vector<int64_t>> vs; std::vector<std::vector<float>> vs1; - std::vector<uint64_t> v = {37, 96}; + std::vector<int64_t> v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {112, 45, 145}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions; dense_regions.insert( - std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {})); + std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector<std::vector<uint64_t>> _vs; + std::vector<std::vector<int64_t>> _vs; std::vector<std::vector<float>> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 1, _vs, vs, false); + 0, std::vector<int64_t>(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector<std::vector<uint64_t>> vs1; + std::vector<std::vector<int64_t>> vs1; std::vector<std::vector<float>> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 1, vs1, vs2, false); + 0, std::vector<int64_t>(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set<uint64_t> count_item_nodes; + std::unordered_set<int64_t> count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 +558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res; + std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector<uint64_t>(1, 96), 4, true, false); + std::string("user2item"), std::vector<int64_t>(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector<uint64_t> node_ids; + std::vector<int64_t> node_ids; node_ids.push_back(96); 
node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector<uint64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector<int64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 0000000000000..65455028247dd --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <unistd.h> +#include <condition_variable> // NOLINT +#include <fstream> +#include <iomanip> +#include <string> +#include <thread> // NOLINT +#include <unordered_set> +#include <vector> +#include "google/protobuf/text_format.h" + +#include <chrono> +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector<std::string> edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 
char edge_file_name[] = "edges.txt"; + +std::vector<std::string> nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + 
+void prepare_file(char file_name[], std::vector<std::string> data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector<paddle::framework::GpuPsCommGraph> res; + std::promise<int> prom; + std::future<int> fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector<paddle::framework::GpuPsCommGraph> &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector<paddle::framework::GpuPsCommGraph> res1; + std::promise<int> prom1; + std::future<int> fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector<paddle::framework::GpuPsCommGraph> &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector<paddle::framework::GpuPsCommGraph> res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 698a698fc6d18..691a381405e9a 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,7 @@ -set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") @@ -9,6 +10,8 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) diff --git 
a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59..9c4089af092e4 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation:: -operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) { +operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa40165167c..a91a0b6e34c0d 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) - override; + const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4..0bc998a03a80b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale:: -operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) { +operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph) { // 1. 
Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f86..e263f73a6b8a4 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) - override; + const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector<paddle::experimental::Tensor>& tensors); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index ba6a936d68651..1be3b31de00a6 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, scale_node->SetTensorWrappers_X({x}); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0); + scale_node->SetGradOutMeta(x, /*slot id*/ 0); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0); + scale_node->SetGradInMeta(out, /*slot id*/ 0); // Set History for output set current Grad Node for EagerUtils::SetHistory(p_autograd_out, scale_node); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359a3..a9a62fcd50e7a 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include <atomic> #include <memory> #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector<paddle::OpMetaInfo>>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map<std::string, std::vector<std::vector<size_t>>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. 
bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a37c..b485beca57a21 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index dc79a8a45a246..df2cdc35626a8 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); + + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; - // Only allow SumOp - if (op_type != "sum") { - return true; + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -973,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -992,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string 
get_output_autograd_meta_str = "";
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
@@ -1000,22 +1009,39 @@
     const std::string& output_name = output.name();
     const std::string& output_autograd_name = "p_autograd_" + output_name;
 
+    // output autograd_meta should be obtained after running TraceOp.
     if (output.duplicable()) {
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
-          "  std::vector<egr::AutogradMeta*> %s = "
+          "    std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_output_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
     } else {
-      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
-          "  egr::AutogradMeta* %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+      // In inplace op, the case where output is duplicable is not considered.
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(output_name)) {
+        auto inplace_input_name = inplace_map[output_name];
+        const std::string& inplace_input_autograd_name =
+            "p_autograd_" + inplace_input_name;
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    %s = egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str += paddle::string::Sprintf(
+            GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
+            inplace_input_name);
+      } else {
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    egr::AutogradMeta* %s = "
+            "egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str +=
+            paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                    output_autograd_name, output_name);
+      }
     }
   }
   VLOG(6) << "Generated outputs autograd_meta";
 
+  // input autograd_meta should be obtained before running TraceOp (for
+  // checking inplace).
   for (const proto::OpProto::Var& input : in_vars) {
     const std::string& input_name = input.name();
     const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,28 +1050,46 @@
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
           "  std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else if (input.dispensable()) {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
          "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
          "  egr::AutogradMeta* %s = "
          "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     }
   }
   VLOG(6) << "Generated inputs autograd_meta";
 
+  // check inplace input to avoid inplace operations on leaf nodes with
+  // stop_gradient=False.
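+  // A leaf tensor created with stop_gradient=False owns a buffer that the
+  // autograd graph may still read, so an inplace op must not overwrite it.
+  // As a sketch (op and slot names illustrative), the generated guard for an
+  // inplace op such as relu_ is expected to expand to:
+  //   egr::EagerUtils::CheckInplace(X, p_autograd_X, require_any_grad);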
+ std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. + // output autograd_meta will be generated after running TraceOP. + prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1060,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1069,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1088,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. 
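+      // e.g. with inplace_map = {"Out": "X"} the generated line becomes
+      //   grad_node->SetTensorWrapperOut(X, full_reserved);
+      // so the wrapper captures the tensor that is actually modified in
+      // place (slot names here are illustrative).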
+ if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1109,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1123,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(&%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1139,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. 
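+      // e.g. for inplace_map = {"Out": "X"}, SetOutRankWithSlot, SetHistory
+      // and SetGradInMeta below all receive p_autograd_X / X instead of the
+      // usual p_autograd_Out / Out (slot names illustrative).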
+ if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(&%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " 
egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1215,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1295,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if 
(!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1356,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1398,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1424,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. 
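+  // A sketch of the expected ordering in the generated forward function
+  // (X/Out stand in for real slot names):
+  //   bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(...);
+  //   egr::EagerUtils::CheckInplace(X, p_autograd_X, require_any_grad);  // inplace ops only
+  //   ...->TraceOp(op_type, ins, outs, attrs, place, &default_attrs, true, {{"X", "Out"}});
+  //   egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
+  //   if (require_any_grad) { /* grad node creation */ }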
+ std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1464,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1533,34 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. + return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. 
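+    // (Inside GenerateGradNodeCreationContent the node-creation block,
+    // including fetching the output autograd_meta, is wrapped in a
+    // RecordEvent scope per GRAD_NODE_CREATION_TEMPLATE above, while the
+    // TraceOp body runs just before it.)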
+    std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(
+        fwd_info, bwd_info, trace_op_body_str, inplace_map);
+
+    generated_function_body += grad_node_creation_body_str;
     generated_function_body += "\n";
 
     // [Generation] Call RetainGradForTensor
     VLOG(6) << "Generated GradNode Creation codes";
+  } else {
+    // If GradNode doesn't need to be generated, generate TraceOp directly.
+    generated_function_body += trace_op_body_str;
   }
 
   // [Generation] Handle return: Tuple/Vector/Tensor
@@ -1607,17 +1781,33 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   VLOG(6) << "Generated return codes";
 
   // [Generation] Get Full Function
-  std::string function_name = op_type + "_dygraph_function";
+  std::string function_name;
+  if (inplace_map.empty()) {
+    function_name = op_type + "_dygraph_function";
+  } else {
+    // change function_name for inplace op.
+    function_name = op_type + "__dygraph_function";
+  }
 
   if (dygraph_function_args_str.size() > 0) {
     auto iter = dygraph_function_args_str.begin();
     if ((*iter) == ',') dygraph_function_args_str.erase(iter);
   }
 
-  const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n";
+  const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
+      "  paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);";
+  std::string event_name = op_type + " dygraph";
+  std::string fwd_record_event_str = paddle::string::Sprintf(
+      DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
+  const char* FWD_FUNCTION_TEMPLATE =
+      "%s %s(%s) {\n\n"
+      "%s\n"
+      "%s\n"
+      "}\n\n";
   std::string fwd_function_str = paddle::string::Sprintf(
       FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, generated_function_body);
+      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
 
   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
@@ -1804,7 +1994,7 @@ static std::string GenerateSingleOpBase(
         !is_op_base_per_duplicable_input) {
       const char* GRAD_OUTS_CONTENT_TEMPLATE =
           "{ \"%s\", egr::EagerUtils::CreateVars( "
-          "this->OutputMeta()[%d].Size() ) },";
+          "this->OutputMeta()[%d].size() ) },";
       outs_contents_str += paddle::string::Sprintf(
           GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position);
     } else {
@@ -1842,18 +2032,17 @@ static std::string GenerateSingleOpBase(
   const char* ATTRS_TEMPLATE = "  auto& %s = this->attr_map_;\n";
   std::string grad_attrs_str =
       paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name);
-  for (const auto& iter : grad_attrs) {
-    if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue;
-    std::pair<std::string, std::string> type_val =
-        GetAttrType(iter.second, false /*is_arg*/);
-    const char* GRAD_ATTRS_TEMPLATE =
-        "  %s %s = %s;\n"
-        "  %s[\"%s\"] = %s;\n";
-    std::string var_name = iter.first + std::to_string(*outs_size);
-    grad_attrs_str += paddle::string::Sprintf(
-        GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second,
-        attrs_name, iter.first, var_name);
-  }
+  if (fwd_op_type == "cast") {
+    // switch in/out dtype
+    const char* CAST_GRAD =
+        "  auto temp_type = %s[\"in_dtype\"];\n"
+        "  %s[\"in_dtype\"] = %s[\"out_dtype\"];\n"
+        "  %s[\"out_dtype\"] = temp_type;\n";
+    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
+                                              attrs_name, attrs_name);
+  }
+  // Handle dynamic grad attributes
+  grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name);
   generated_grad_function_body += grad_attrs_str;
 
   const char* TRACE_OP_TEMPLATE =
@@ -2032,7 +2221,7 @@ static
std::string GenerateGradNodeCCContents( if (is_op_base_per_duplicable_input) { const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = - " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n" " %s\n" " }\n"; generated_grad_function_body = paddle::string::Sprintf( @@ -2044,6 +2233,8 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" + " if(NeedComplexToRealConversion()) " + "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), @@ -2053,7 +2244,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2088,18 +2280,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2133,6 +2335,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2164,6 +2367,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2176,10 +2386,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2194,8 +2408,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, 
set_attr_map_str,
-      tensor_wrapper_members_str, attr_members_str);
+      op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
+      set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
 
   return grad_node_str;
 }
@@ -2240,8 +2454,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "\"paddle/fluid/eager/api/generated/fluid_generated/"
       "dygraph_forward_api.h\"\n"
       "#include "
-      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
-      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
+      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
+      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
@@ -2379,7 +2594,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     /* --------------------------- */
     VLOG(6) << "-------- GenerateForwardFunctionContents -------";
     std::pair<std::string, std::string> body_and_declaration =
-        GenerateForwardFunctionContents(fwd_info, bwd_info);
+        GenerateForwardFunctionContents(fwd_info, bwd_info, {});
 
     fwd_function_str += body_and_declaration.first + "\n";
 
@@ -2387,6 +2602,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     std::string fwd_function_declare_str = body_and_declaration.second;
     dygraph_forward_api_str += fwd_function_declare_str;
 
+    auto& infer_inplace =
+        paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
+    std::map<std::string, std::string> inplace_map;
+    // Inplace Function Generator.
+    // `sum` op has duplicate input. Don't consider adding an inplace strategy
+    // for `sum` for now.
+    if (op_type != "sum" && infer_inplace) {
+      auto in_to_outs = infer_inplace(true);
+      for (auto& inplace_pair : in_to_outs) {
+        inplace_map[inplace_pair.second] = inplace_pair.first;
+      }
+
+      VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------";
+      std::pair<std::string, std::string> inplace_body_and_declaration =
+          GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map);
+
+      fwd_function_str += inplace_body_and_declaration.first + "\n";
+
+      VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------";
+      std::string inplace_fwd_function_declare_str =
+          inplace_body_and_declaration.second;
+      dygraph_forward_api_str += inplace_fwd_function_declare_str;
+    }
+
     if (bwd_info.GenerateForwardOnly()) continue;
 
     VLOG(6) << "-------- GenerateGradNodeHeaderContents -------";
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index c6bca01205e19..771351dd4affb 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
-set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
+set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
+set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
 set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
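+# Each generator run now receives comma-separated yaml lists, so the dense and
+# sparse APIs are processed in one pass (the sparse entries land in the
+# `sparse` namespace).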
set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") @@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") + add_custom_target(eager_final_state_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index f56cf8ef24cf6..1685b6f3cb5c3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,10 +23,12 @@ core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -125,6 +127,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -133,15 +136,25 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] @@ -160,15 +173,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -196,8 +213,11 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." 
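+    # e.g. a yaml arg declared as "str data_format" maps to std::string via
+    # yaml_types_mapping, and a name like "x@GRAD" is stripped to "x" by
+    # RemoveSpecialSymbolsInName below (both examples illustrative).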
arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -229,10 +249,12 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list @@ -456,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -477,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -494,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -502,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -533,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
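+  // (each SetTensorWrapper method below stores one forward tensor so the
+  // backward node can reuse it)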
{} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -605,19 +656,27 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, else: # Rearrange output order accordingly returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -671,7 +730,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -699,18 +758,27 @@ def GenerateNodeCreationCodes( # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): is_optional = (name in optional_inputs) + if is_fwd_input: if is_optional: set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" + else: + tw_name = f"api_result" + if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ 
-719,7 +787,7 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});" + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) @@ -736,17 +804,18 @@ def GenerateNodeCreationCodes( output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - set_grad_in_meta = f" grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});" + if num_outputs == 1: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + else: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) - - if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" set_retain_grad_list.append(set_retain_grad) + set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) @@ -850,7 +919,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -864,7 +937,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, returns_list[0] = f"api_result" else: # Tuple api_result - returns_list[pos] = f"api_result[{pos}]" + returns_list[pos] = f"std::get<{pos}>(api_result)" if IsPlainTensorType(rtype): returns_type_list[pos] = "paddle::experimental::Tensor" @@ -887,8 +960,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -902,7 +987,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) 
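+    # dygraph_event_str places a RecordEvent entry at the top of every
+    # generated forward function, mirroring the profiler hook added to the
+    # fluid eager_generator above.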
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1002,6 +1087,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1022,10 +1108,13 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ +#include "paddle/phi/api/lib/dygraph_api.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" """ @@ -1055,134 +1144,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, 
orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + 
if 'backward' not in fwd_api.keys(): + continue + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, 
backward_attrs_list, + no_need_buffer_set) + print("Generated Node Declaration: ", yaml_node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", yaml_node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, orig_forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", definition_declaration_pair[0]) + print("Generated Forward Declaration: ", definition_declaration_pair[1]) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + orig_forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d0506e45eb476..e1c2cf871ea42 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,9 +14,18 @@ import os import argparse -from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +import logging +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap + +########################### +## Global Configurations ## +########################### +skipped_forward_api_names = set(["scale"]) + + +def SkipAPIGeneration(forward_api_name): + return (forward_api_name in skipped_forward_api_names) -skipped_fwd_api_names = set(["scale"]) atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -24,7 +33,7 @@ "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector<bool>": "CastPyArg2Booleans", "std::vector<int>": "CastPyArg2Ints", "std::vector<long>": "CastPyArg2Longs", "std::vector<float>": "CastPyArg2Floats", @@ -39,64 +48,35 @@ } -def ParseArguments(): - parser = argparse.ArgumentParser( - description='Eager Code Generator Args Parser') - parser.add_argument('--api_yaml_path', type=str) - parser.add_argument('--output_path', type=str) - - args = parser.parse_args() - return args - - def FindParsingFunctionFromAttributeType(atype): if atype not in
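The atype_to_parsing_function table above is consulted by FindParsingFunctionFromAttributeType, whose definition continues below. An illustrative lookup, as a small self-contained sketch assuming the (reconstructed) mapping above:

```python
# Sketch: the generator turns an attribute's C++ type into the name of the
# CastPyArg2* helper that parses it out of the Python argument tuple.
atype_to_parsing_function = {
    "bool": "CastPyArg2Boolean",
    "std::string": "CastPyArg2String",
    "std::vector<int>": "CastPyArg2Ints",  # reconstructed key, see above
}

def find_parsing_function(atype):
    assert atype in atype_to_parsing_function, \
        f"Unable to find {atype} in atype_to_parsing_function."
    return atype_to_parsing_function[atype]

print(find_parsing_function("std::vector<int>"))  # -> CastPyArg2Ints
```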
atype_to_parsing_function.keys(): - print(f"Unable to find {atype} in atype_to_parsing_function.") - assert False + assert False, f"Unable to find {atype} in atype_to_parsing_function." return atype_to_parsing_function[atype] -def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map, - optional_inputs, is_forward_only): - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = [name0, ...] - - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ + +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -126,22 +106,50 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, }} """ - if is_forward_only: - fwd_function_name = fwd_api_name - else: - fwd_function_name = GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function 
for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ + +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -186,10 +194,12 @@ def GenerateCoreOpsInfoMap(): return nullptr; } } - """ +""" - core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + +CORE_OPS_INFO_REGISTRY = \ +""" + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -201,7 +211,259 @@ def GenerateCoreOpsInfoMap(): METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] 
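The remaining fields of this class continue directly below. Before them, a sketch of how one of these per-API generators is driven end to end; the run() method further below calls the same steps in this order (the api entry is hypothetical, and the snippet assumes this module is importable):

```python
# Sketch: driving one PythonCSingleFunctionGenerator by hand.
fwd_api_contents = {
    "api": "matmul",                 # hypothetical op
    "args": "(Tensor x, Tensor y)",
    "output": "Tensor(out)",
    "backward": "matmul_grad",
}
gen = PythonCSingleFunctionGenerator(fwd_api_contents, namespace="")
gen.CollectIsForwardOnly()       # False here: a 'backward' entry exists
gen.CollectRawContents()         # api / args / output strings
gen.CollectOptionalInputs()      # none for this entry
gen.CollectForwardInOutAttr()    # parse into inputs/attrs/returns lists
gen.CollectForwardPositionMap()  # name -> [type, position]
gen.GeneratePythonCFunction()    # fills python_c_function_str / _reg_str
```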
+ self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, 
forward_api_name) + else: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "::", namespace, GetForwardFunctionName(forward_api_name)) + + # Generate Record Event for performance profiling + pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( + "pythonc_record_event", forward_api_name, "pybind_imperative_func") + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( + forward_api_name, namespace, forward_api_name, forward_api_name) + + def run(self): + # Initialize is_forward_only + self.CollectIsForwardOnly() + + # Initialize forward_api_name, forward_args_str, forward_returns_str + self.CollectRawContents() + if SkipAPIGeneration(self.forward_api_name): return False + + # Initialize optional_inputs + self.CollectOptionalInputs() + + # Initialize forward_inputs_list, forward_returns_list, forward_attrs_list + self.CollectForwardInOutAttr() + logging.info( + f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + logging.info( + f"Parsed Original Forward Attrs List: \n{self.forward_attrs_list}") + logging.info( + f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + ) + + # Initialize forward_inputs_position_map, forward_outputs_position_map + self.CollectForwardPositionMap() + logging.info( + f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" + ) + logging.info( + f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" + ) + + # Code Generation + self.GeneratePythonCFunction() + logging.info( + f"Generated Python-C Function: {self.python_c_function_str}") + logging.info( + f"Generated Python-C Function Registration: {self.python_c_function_reg_str}" + ) + + return True + + +class PythonCYamlGenerator: + def __init__(self, path): + self.yaml_path = path + + self.namespace = "" + self.forward_api_list = [] + + # Generated Result + self.python_c_functions_reg_str = "" + self.python_c_functions_str = "" + + def ParseYamlContents(self): + yaml_path = self.yaml_path + self.forward_api_list = ReadFwdFile(yaml_path) + + def GeneratePythonCFunctions(self): + namespace = self.namespace + forward_api_list = self.forward_api_list + + for forward_api_content in forward_api_list: + f_generator = PythonCSingleFunctionGenerator(forward_api_content, + namespace) + status = f_generator.run() + + if status: + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator.python_c_function_str + "\n" + + def InferNameSpace(self): + yaml_path = self.yaml_path + if "sparse" in yaml_path: + self.namespace = "sparse::" + + def AttachNamespace(self): + namespace = self.namespace + python_c_functions_str = self.python_c_functions_str + + if namespace != "": + if namespace.endswith("::"): + namespace = namespace[:-2] + self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, python_c_functions_str) + + def run(self): + # Infer namespace from yaml_path + self.InferNameSpace() + + # Read Yaml file + self.ParseYamlContents() + + # Code Generation + self.GeneratePythonCFunctions() + + # Wrap with namespace + self.AttachNamespace() + + +############################ +## Code Generation Helper ## +############################ +def ParseArguments():
parser = argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -213,33 +475,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -253,63 +488,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() + api_yaml_paths = args.api_yaml_path.split(",") - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) + generated_python_c_functions = "" + generated_python_c_registration = "" + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) + y_generator = PythonCYamlGenerator(api_yaml_path) + 
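run(), invoked just below, infers the namespace from the yaml path and wraps the generated functions via NAMESPACE_WRAPPER_TEMPLATE. A self-contained illustration of that wrapping (the generated body is a placeholder):

```python
NAMESPACE_WRAPPER_TEMPLATE = \
"""namespace {} {{
  {}
}}
"""

ns = "sparse::"          # inferred because "sparse" occurs in the yaml path
if ns.endswith("::"):
    ns = ns[:-2]         # AttachNamespace trims the trailing "::"
print(NAMESPACE_WRAPPER_TEMPLATE.format(ns, "/* generated Python-C functions */"))
# namespace sparse {
#   /* generated Python-C functions */
# }
```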
y_generator.run() - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6b..dca76d3b8a0db 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin): Should we use pointer instead of object? std::shared_ptr<paddle::experimental::Tensor> grad_{ - std::make_shared<paddle::experimental::Tensor>( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared<paddle::experimental::Tensor>()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 934497d7d179c..ebd3333c52659 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -27,6 +29,325 @@ namespace egr { +/* +* GeneralGrad is a helper class to implement custom grad operations between +* outputs and inputs. +* +* **/ +class GeneralGrad { + public: + static GeneralGrad& Instance() { return *general_grad_; } + + // Get inputs' / no_grad_vars' GradNodes and InputMeta Info + void GetTargetNodesInfo( + const std::vector<paddle::experimental::Tensor>& inputs, + bool is_no_grad_vars) { + std::string msg = is_no_grad_vars ?
"no_grad_vars" : "inputs"; + VLOG(6) << "Running in GetTargetNodesInfo."; + if (!inputs.empty()) { + VLOG(6) << msg << " are not empty."; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for %s:[%d] or its " + "stop_gradient=True.", + msg, i)); + if (is_no_grad_vars) { + (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } else { // normal input + (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } + } + } + + // Purify potential_startup_nodes, remove nodes that are the same as + // input_target_nodes + void PurifyPotentialStartUpNodes() { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map.empty()) return; + std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased; + for (auto startup_op : potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map.find(startup_op); + if (iter != input_target_nodes_inputmeta_map.end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes.erase(nodes); + } + } + } + + // Remove nodes that don't need to be + // stored in potential_stop_nodes / potential_startup_nodes + void UpdateGraphInfo() { + // Update potential_stop_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set<GradNodeBase*> _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue<GradNodeBase*> queue; + for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(depending_nodes)[target_node].empty()) { + auto preceding_nodes = (depending_nodes)[target_node]; + for (auto pre_nodes : preceding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes.find(pre_nodes) != + potential_stop_nodes.end()) { + potential_stop_nodes.erase(pre_nodes); + } + } + } else { // startup_ops have no preceding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove potential + // startup_nodes that cannot reach the input target nodes + if (!_startup_ops.empty()) { + std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased; + for (auto node : potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes.erase(node); + } + } + } + } + + // Get graph info between input target GradNodes and outputs, + // record depending_nodes, potential_stop_nodes, potential_startup_nodes + void GetGraphInfoBetweenTargets(const std::queue<GradNodeBase*>& init_queue) { + VLOG(6) << "Running in GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map<GradNodeBase*, int> node_in_degree_map; + + // Copy nodes + std::queue<GradNodeBase*> queue = init_queue; + std::unordered_set<GradNodeBase*> visited; + + // Visit each node exactly once in
any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check whether the node is a target node; if it is not, + // all of its next_nodes will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map.count(node); + + // Find and append next nodes + const std::vector<std::vector<Edge>>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached, + // or it could also originate from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes.emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + + // Record depending relationship + (depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some nodes in + // potential_stop_nodes / potential_startup_nodes + UpdateGraphInfo(); + } + + void ModifyReadyQueue(std::queue<GradNodeBase*>* queue) { + std::queue<GradNodeBase*> tmp_queue; + for (auto nodes : potential_startup_nodes) { + tmp_queue.emplace(nodes); + } + tmp_queue.swap(*queue); + } + + // Set result for input target grad_var when potential_startup_nodes is empty + void SetResultForInputTargetVar( + const std::unordered_map<GradNodeBase*, + std::unique_ptr<GradTensorHolder>>& + node_input_buffers_dict) { + if (potential_startup_nodes.size() == 0) { + for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + auto iter = node_input_buffers_dict.find(input_target_node.first); + if (iter != node_input_buffers_dict.end()) { + auto& target_result = + (iter->second)->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + } + + // Set input target grad_var from node_input_buffer by inputmeta + void SetResultForInputTargetVar(GradTensorHolder input_buffers, + GradNodeBase* node) { + auto iter = GetInPutTargetNodesInputMetaMap()->find(node); + if (iter != GetInPutTargetNodesInputMetaMap()->end()) { + VLOG(6) << "Get target result by inputmeta"; + // out rank_info of forward op + auto rank_info = (iter->second)->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank.
+ auto& target_result = + input_buffers.Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + } + + std::vector<paddle::experimental::Tensor> GetResults( + const std::vector<paddle::experimental::Tensor>& inputs, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector<paddle::experimental::Tensor> results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map.find(target_node); + if (iter != results_map.end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input tensor or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + Clear(); + return results; + } + + void PreparedForGeneralGrad( + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& no_grad_vars, + std::queue<GradNodeBase*>* queue, + const std::unordered_map<GradNodeBase*, + std::unique_ptr<GradTensorHolder>>& + node_input_buffers_dict) { + // Get no_grad_vars' GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); + // Get inputs' GradNodes and InputMeta Info + GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(); + // Get graph info between input target grad nodes and outputs + // Record the depending_nodes and + // potential_stop_nodes, potential_startup_nodes + GetGraphInfoBetweenTargets(*queue); + // Reset queue. Queue is empty only when + // 1. input equals output; 2. input cannot reach output. + ModifyReadyQueue(queue); + // Set result for input target grad_var when queue is empty + if (queue->empty()) SetResultForInputTargetVar(node_input_buffers_dict); + } + + bool IsPotentialStopNodes(GradNodeBase* node) { + return potential_stop_nodes.count(node); + } + + std::unordered_map<GradNodeBase*, AutogradMeta*>* + GetNoGradVarNodesInputMetaMap() { + return &no_grad_var_nodes_inputmeta_map; + } + + std::unordered_map<GradNodeBase*, AutogradMeta*>* + GetInPutTargetNodesInputMetaMap() { + return &input_target_nodes_inputmeta_map; + } + + std::unordered_set<GradNodeBase*>* GetPotentialStopNodes() { + return &potential_stop_nodes; + } + + std::unordered_set<GradNodeBase*>* GetPotentialStartupNodes() { + return &potential_startup_nodes; + } + + void Clear() { + no_grad_var_nodes_inputmeta_map.clear(); + input_target_nodes_inputmeta_map.clear(); + potential_startup_nodes.clear(); + potential_stop_nodes.clear(); + depending_nodes.clear(); + results_map.clear(); + } + + private: + GeneralGrad() = default; + static GeneralGrad* general_grad_; + // no_grad_vars' GradNode and GradNode's InputMeta. + std::unordered_map<GradNodeBase*, AutogradMeta*> + no_grad_var_nodes_inputmeta_map; + // inputs' GradNode and GradNode's InputMeta. + std::unordered_map<GradNodeBase*, AutogradMeta*> + input_target_nodes_inputmeta_map; + // Record all the potential startup_nodes, will be changed. + std::unordered_set<GradNodeBase*> potential_startup_nodes; + // Record all the potential stop nodes, will be changed.
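To summarize the control flow of GetResults above in a compact form, a Python sketch (grad_node_of is a hypothetical accessor standing in for EagerUtils::unsafe_autograd_meta plus GetMutableGradNode; not the real API):

```python
def get_results(inputs, results_map, allow_unused, create_graph):
    """Sketch of GeneralGrad::GetResults; illustrative only."""
    results = []
    for i, inp in enumerate(inputs):
        node = grad_node_of(inp)  # hypothetical accessor
        if node in results_map:
            grad = results_map[node]
            # Keep the grad graph alive only for double-grad (create_graph).
            grad.stop_gradient = not create_graph
            results.append(grad)
        elif allow_unused:
            results.append(None)  # input unused by the backward graph
        else:
            raise ValueError(
                f"The {i}-th input does not appear in the backward graph; "
                "set allow_unused=True to get a None result.")
    return results
```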
+ std::unordered_set<GradNodeBase*> potential_stop_nodes; + std::unordered_map<GradNodeBase*, + std::unordered_set<GradNodeBase*> /* pre nodes */> + depending_nodes; + std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map; + DISABLE_COPY_AND_ASSIGN(GeneralGrad); +}; + std::unordered_map<GradNodeBase*, int> getInDegreeMap( const std::queue<GradNodeBase*>& init_queue) { // Calculate in_degree for each node @@ -74,14 +395,51 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors, - const std::vector<paddle::experimental::Tensor>& grad_tensors, - bool retain_graph) { +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} + +void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs, + bool is_input) { + std::unordered_set<AutogradMeta*> visited_ins; + std::string msg = is_input ? "inputs" : "outputs"; + for (auto in : inputs) { + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); + PADDLE_ENFORCE_EQ( + visited_ins.count(auto_grad_meta), 0, + paddle::platform::errors::AlreadyExists( + "%s contains duplicate tensor %s, please check %s carefully.", msg, + in.name(), msg)); + visited_ins.insert(auto_grad_meta); + } +} + +GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad(); + +std::vector<paddle::experimental::Tensor> RunBackward( + const std::vector<paddle::experimental::Tensor>& tensors, // output + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector<paddle::experimental::Tensor>& inputs = {}, + bool allow_unused = false, + const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + // GeneralGrad + bool is_general_grad = !inputs.empty(); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2.
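getInDegreeMap (context lines above) and the GeneralGrad bookkeeping both rely on the same breadth-first pass over the grad graph: count how many edges feed each reachable node. A minimal Python rendering of that idea (edges_of is a stand-in for GradNodeBase::GetEdges; a sketch, not the real API):

```python
from collections import deque

def get_in_degree_map(start_nodes, edges_of):
    """Sketch of getInDegreeMap: BFS, visiting each node's edges once."""
    in_degree, visited = {}, set()
    queue = deque(start_nodes)
    while queue:
        node = queue.popleft()
        if node in visited:
            continue
        visited.add(node)
        for nxt in edges_of(node):   # successors in the backward graph
            in_degree[nxt] = in_degree.get(nxt, 0) + 1
            queue.append(nxt)
    return in_degree
```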
Prepare initial input buffers @@ -112,7 +470,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -123,11 +482,20 @@ void RunBackward(const std::vector& tensors, paddle::platform::errors::Fatal( "Detected size mismatch between tensors and grad_tensors" "grad_tensors should either have " - "size = 0 or same size as tensors")); + "size = 0 or same size as tensors.")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -140,8 +508,11 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + if (is_general_grad) { + GeneralGrad::Instance().GetPotentialStartupNodes()->emplace(grad_node); + } } VLOG(6) << "Update In degree Map for backward"; @@ -149,37 +520,88 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + if (is_general_grad) { + // Prepare several vital preprocess for GeneralGrad + GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue, + node_input_buffers_dict); + } + + VLOG(6) << " startup_ops' size is :" << queue.size(); + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. 
Update queue VLOG(6) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); + + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the InputBuufer" - "Trying to run Node without configuring its GradTensorHolder")); + "Unable to find next node in the GradTensorHolder \n" + "Trying to run Node without configuring its GradTensorHolder.")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + // Set input target grad_var from node_input_buffer by inputmeta + if (!inputs.empty() && is_general_grad) { + GeneralGrad::Instance().SetResultForInputTargetVar(*node_input_buffer, + node); + } + + // no_grad_vars + if (!no_grad_vars.empty() && is_general_grad) { + auto iter = + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node); + if (iter != + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = (iter->second)->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // Check input + EnforceGradNodeHasInput(node); + + VLOG(6) << "Run Backward Kernel with GradTensorHolder."; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. 
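The ClearTensorWrappers/retain_graph interplay above is what the "calculate backward twice" error raised by EnforceGradNodeHasInput refers to. A hedged Python-level illustration of the intended semantics, assuming the usual paddle dygraph API (not part of this diff):

```python
import paddle

x = paddle.to_tensor([1.0, 2.0], stop_gradient=False)
y = (x * x).sum()

# First backward keeps the graph alive so the TensorWrappers are not cleared.
y.backward(retain_graph=True)

# A second backward over the same subgraph is only legal because of
# retain_graph=True above; otherwise the cleared TensorWrappers would
# trigger the error message quoted earlier.
y.backward()
```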
+ node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -190,6 +612,7 @@ void RunBackward(const std::vector& tensors, for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -203,6 +626,7 @@ void RunBackward(const std::vector& tensors, grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -215,9 +639,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -228,6 +651,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first @@ -237,16 +662,54 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. 
For Node: %s" + "Node's in-degree cannot be negative.", + next_node->name())); + + if (is_general_grad) { + bool is_potential_stop_node = + GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node); + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + queue.emplace(std::move(next_node)); + } + } else { + if (node_in_degree_map[next_node] == 0) { + queue.emplace(std::move(next_node)); + } + } + } + } + } + if (!is_general_grad) return {}; + return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph); +} + +void Backward( + const std::vector<paddle::experimental::Tensor>& tensors, // outputs + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); } +std::vector<paddle::experimental::Tensor> Grad( + const std::vector<paddle::experimental::Tensor>& tensors, // outputs + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector<paddle::experimental::Tensor>& no_grad_vars) { + VLOG(6) << "Run in Grad"; + + DuplicateCheck(inputs, true /* is_input */); + DuplicateCheck(tensors, false /* is_input */); + + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34..bebe664838e6c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector<paddle::experimental::Tensor> &tensors, - const std::vector<paddle::experimental::Tensor> &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector<paddle::experimental::Tensor>& tensors, + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph = false); + +std::vector<paddle::experimental::Tensor> Grad( + const std::vector<paddle::experimental::Tensor>& tensors, + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 0000000000000..ccc9a03a55660 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 0000000000000..72af1cc4b0686 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()(const std::vector>& grads, + bool create_graph) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 0000000000000..6ece2658575c7 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,83 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads, + bool create_graph) override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 427be83c3bbee..891ad4d8983b5 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,17 +15,23 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. 
**/ namespace egr { @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ -44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. 
If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. 
Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." 
+ "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777..4b21a193ee021 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -95,8 +102,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + + virtual void ClearTensorWrappers() = 0; + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. @@ -108,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. 
+ // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -158,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -170,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -184,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420..038ad09aa4d8b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { @@ -88,7 +93,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, // Create new tensor->impl and fill it with 1.0 if (t.defined()) { // Fill 1.0 - buffer_[slot_id][rank] = paddle::experimental::ones_like(t); + buffer_[slot_id][rank] = paddle::experimental::ones_like(t, t.dtype()); } } } diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe82859..8c00f9161b629 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should 
have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } @@ -56,6 +57,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643..8da27f3bb8a13 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,13 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. 
" + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5..c8b2d22dcf951 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index e3db309c4016a..d592b5ccf66ff 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ 
b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + 
CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b17..0b167203735d6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 8c6eeca9d3d5d..645eac06ddda5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -24,12 +24,13 @@ #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); + // TODO(jiabin): remove nolint here!!! 
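// ---------------------------------------------------------------------------
// A hedged sketch, not part of this patch, of how GradTensorHolder is seeded
// now that GradSlotMeta has lost its Init()/Size() interface: the buffer
// shape comes from a vector<vector<GradSlotMeta>>, one inner entry per rank.
// The 2-slot layout and the tensor `t` below are illustrative only.
//
//   std::vector<std::vector<egr::GradSlotMeta>> metas(2);
//   metas[0].resize(1);                   // slot 0 holds one rank
//   metas[1].resize(3);                   // slot 1 holds three ranks
//   egr::GradTensorHolder holder(metas);  // buffer_[i].size() == metas[i].size()
//   holder.add(/*slot_id=*/0, /*rank=*/0, t, /*fill_one=*/true);
//   // fill_one seeds ones_like(t, t.dtype()); later contributions accumulate
//   holder.SetBufferSlotRankZeros(0, 0);  // reset that entry to zeros_like
// ---------------------------------------------------------------------------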
using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -70,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -136,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6..056c7102f663b 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -72,6 +82,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4..5e790389819f5 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ 
b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,21 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -74,6 +84,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); @@ -186,7 +240,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696..b4b47a85f6666 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf..a3e393b039425 100644 --- 
a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,18 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +258,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 96126fa5466aa..c8fb6050e9d45 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -36,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { @@ -57,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 
2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + Backward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ @@ -82,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -113,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e15..86bf13707ed40 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf22..52dba6b9218c7 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3..87f8f6eca1f88 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,11 @@ #include 
"paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { @@ -75,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -134,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -207,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -311,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e5563..8b0759c17ed37 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { @@ -67,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5..dc44d95daac1d 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe370..882695e98d109 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( @@ -79,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { 
std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -130,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -196,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -253,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -311,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -349,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -405,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6..49e517dc9b3f3 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { @@ -51,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -83,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -114,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); @@ -122,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc 
b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000..6b03799c48659 --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <sstream> + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared<GradNodeScale>(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Set output_tensor's GradNode, OutRank and StopGradient properties + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast<GradNodeBase>(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared<GradNodeAccumulation>(auto_grad_meta1); + + // Set input tensor's GradNode, OutRank and StopGradient properties + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Add edges on grad_node + std::vector<AutogradMeta*> res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector<paddle::experimental::Tensor> outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue<float>(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { 
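// (Sketch, not part of this patch:) Grad() differs from Backward() in that
// it returns the computed gradients for the requested inputs instead of only
// accumulating them into the leaves' grad buffers, as the test above shows:
//
//   auto result = Grad(/*tensors=*/outs,
//                      /*inputs=*/{leaf_tensor},
//                      /*grad_tensors=*/{});  // empty -> default seeds,
//                                             // consistent with 5.0 above
//   eager_test::CompareTensorWithValue<float>(result[0], 5.0);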
+ // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // 
Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; 
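// Note on the wiring pattern used in this block: meta0 carries node2_ptr
// together with out-rank (0, 0), so the AddEdges call that follows records an
// Edge from node0's output slot 0 to node2. Backward()/Grad() later follow
// these adj_edges_ entries (built by AddEdges earlier in this patch) to route
// node0's grad outputs into node2's grad inputs.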
+ node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55..2c53fc89f650e 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( @@ -128,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -195,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751..b86865e2d126f 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( @@ -102,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -160,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -218,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( @@ -249,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // 
namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f..24e5da060111f 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 6f8bccd64e45f..277319bc700b6 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -57,6 +57,7 @@ inline void run_program_dygraph_function( auto grad_node = std::make_shared(1, 2); grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); // Set Attributes grad_node->SetAttrMap(attrs); // Set TensorWrappers @@ -65,10 +66,10 @@ inline void run_program_dygraph_function( grad_node->SetStepScope(step_scope); // Set Grad out rank as same as fwd input and set stop gradient to bwd - grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); - grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + grad_node->SetGradOutMeta(x, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); - grad_node->SetGradInMeta(&p_autograd_outs, 0); + grad_node->SetGradInMeta(deref_out, 0); // Set Next Edges grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ae5d86664a346..4eaa64d3ac659 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -260,9 +260,9 @@ inline void RunProgramAPI( } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); - // #ifdef PADDLE_WITH_MKLDNN - // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); - // #endif +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif } inline void RunProgramGradAPI( @@ -357,7 +357,7 @@ inline void RunProgramGradAPI( details::ShareTensorsFromScope(params_grad, *global_block, &scope); // Step5. 
drop current scope - // global_inner_scope->DeleteScope(&scope); + global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); } @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>> &grads) - override { + const std::vector<std::vector<paddle::experimental::Tensor>> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { paddle::platform::errors::InvalidArgument( "The grads[0].size() and fwd_out_names_.size() should be equal.")); for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast<paddle::experimental::Tensor &>(out_grad).set_impl( + grads[0][i].impl()); + const_cast<paddle::experimental::Tensor &>(grads[0][i]) .set_name(fwd_out_names_[i] + "@GRAD"); } @@ -411,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; @@ -432,6 +442,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { fwd_out_names_ = out_names; } + void SetOut(const std::vector<paddle::experimental::Tensor *> &out) { + out_ = out; + } + protected: void ConstructGradTensors( + const std::vector<paddle::experimental::Tensor> &fwd_tensors, @@ -440,7 +454,11 @@ class GradNodeRunProgram : public egr::GradNodeBase { // such as: name, tensor type(DenseTensor or SelectedRows). VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &fwd_t : fwd_tensors) { - grad_tensors->emplace_back(fwd_t.impl()); + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>()); + } auto &grad_t = grad_tensors->back(); grad_t.set_name(fwd_t.name() + "@GRAD"); } @@ -462,6 +480,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { std::vector<paddle::framework::Scope *> step_scope_; std::vector<std::string> fwd_out_names_; + std::vector<paddle::experimental::Tensor *> out_; // Attribute Map paddle::framework::AttributeMap attrs_; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d2694535e..048087903a47c 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector<std::shared_ptr<EagerVariable>> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr<EagerVariable>& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after an inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. " + "We are trying to modify the inplace input from its " + "shared_ptr; this error may indicate that the " + "inplace input is a nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast<phi::DenseTensor*>(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast<phi::DenseTensor*>(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs( const std::vector<std::shared_ptr<EagerVariable>>& outs) { std::vector<paddle::experimental::Tensor> res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32a0..fbd080ef70e25 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward<Args>(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector<std::shared_ptr<egr::EagerVariable>> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr<EagerVariable>& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector<paddle::experimental::Tensor> GetOutputs( const std::vector<std::shared_ptr<EagerVariable>>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index aa92a3b2226c1..5dc3d9e89c557 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6b5..478e39b99dcc9 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License.
*/ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b56437182..fef1e82a14fe6 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511da..589d09bf81c1d 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. 
if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a14..f951b5d0f5070 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd9393..2b8b4b3ff9573 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 0000000000000..235f7a226ad17 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c00f..3d1599a76e8eb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -13,115 +13,27 @@ // limitations under the License. 
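The comment block above walks through the CSR-style layout of GpuPsCommGraph: node_list[i].neighbor_offset indexes into the shared neighbor_list array and neighbor_size bounds the slice. A minimal host-side sketch of reading one node's neighbors (illustrative only; PrintNeighbors is a hypothetical helper, not part of this patch):

#include <cstdint>
#include <cstdio>
// Assumes the GpuPsGraphNode/GpuPsCommGraph structs from gpu_graph_node.h.
void PrintNeighbors(const paddle::framework::GpuPsCommGraph& g, int i) {
  const paddle::framework::GpuPsGraphNode& n = g.node_list[i];
  std::printf("node %lld:", static_cast<long long>(n.node_id));
  // Neighbors of node i occupy [neighbor_offset, neighbor_offset + neighbor_size).
  for (int j = 0; j < n.neighbor_size; ++j) {
    std::printf(" %lld",
                static_cast<long long>(g.neighbor_list[n.neighbor_offset + j]));
  }
  std::printf("\n");
}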
#pragma once +#include #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. -we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... 
-by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -129,14 +41,26 @@ class GpuPsGraphTable : public HeterComm { int sample_size, int len); NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); - void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - int64_t *src_sample_res, - int *actual_sample_size); + void move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int *h_left, int *h_right, + int64_t *src_sample_res, thrust::host_vector<int> &total_sample_size); + void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, + int *h_left, int *h_right, + int *actual_sample_size, + int *total_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, const std::string &param); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector<GpuPsCommGraph> gpu_graph_list; + std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table; + std::shared_ptr<pthread_rwlock_t> rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c6c..acd3f0a290d0b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,9 +13,23 @@ // limitations under the License.
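The CPU-backed members introduced above (cpu_graph_table, rw_lock, mutex_, cv_) are meant to be driven in a fixed order. A condensed usage sketch, inferred from test_cpu_graph_sample.cu later in this patch (resource and the device pointer keys are assumed to be set up as in that test; error handling elided):

// Sketch of the intended call sequence for the CPU-backed loading path.
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");

GpuPsGraphTable g(resource);    // resource: shared_ptr to the HeterPs resource
g.init_cpu_table(table_proto);  // creates the CPU table and registers the
                                // sampling callback that rebuilds the GPU graph
g.load("edges.txt", "e>");      // starts sampling; blocks on cv_ until the
                                // first sampled graph has been moved to GPU
auto* res = g.graph_neighbor_sample(0 /*gpu_id*/, keys /*int64_t* on device*/,
                                    3 /*sample_size*/, 3 /*len*/);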
#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { + +constexpr int WARP_SIZE = 32; + /* comment 0 this kernel just serves as an example of how to sample nodes' neighbors. @@ -28,30 +42,116 @@ sample_size; */ -__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, - int* actual_size, - int64_t* sample_result, int sample_size, - int len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { +struct MaxFunctor { + int sample_size; + HOSTDEVICE explicit inline MaxFunctor(int sample_size) { + this->sample_size = sample_size; + } + HOSTDEVICE inline int operator()(int x) const { + if (x > sample_size) { + return sample_size; + } + return x; + } +}; + +struct DegreeFunctor { + GpuPsCommGraph graph; + HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) { + this->graph = graph; + } + HOSTDEVICE inline int operator()(int i) const { + return graph.node_list[i].neighbor_size; + } +}; + +template <int BLOCK_WARPS, int TILE_SIZE> +__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph, + int sample_size, int* index, int len, + int64_t* sample_result, int* output_idx, + int* output_offset) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len); + curandState rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); + + while (i < last_idx) { auto node_index = index[i]; - actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size - ? graph.node_list[node_index].neighbor_size - : sample_size; - int offset = graph.node_list[node_index].neighbor_offset; - for (int j = 0; j < actual_size[i]; j++) { - sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + int degree = graph.node_list[node_index].neighbor_size; + const int offset = graph.node_list[node_index].neighbor_offset; + int output_start = output_offset[i]; + + if (degree <= sample_size) { + // Just copy + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + sample_result[output_start + j] = graph.neighbor_list[offset + j]; + } + } else { + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + output_idx[output_start + j] = j; + } + + __syncwarp(); + + for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) { + const int num = curand(&rng) % (j + 1); + if (num < sample_size) { + atomicMax( + reinterpret_cast<unsigned int*>(output_idx + output_start + num), + static_cast<unsigned int>(j)); + } + } + + __syncwarp(); + + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + const int perm_idx = output_idx[output_start + j] + offset; + sample_result[output_start + j] = graph.neighbor_list[perm_idx]; + } } + + i += BLOCK_WARPS; } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function<void(std::vector<GpuPsCommGraph>&)> callback = + [this](std::vector<GpuPsCommGraph>& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + 
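The neighbor_sample kernel above implements a warp-cooperative variant of reservoir sampling: when a node's degree exceeds sample_size, slots [0, sample_size) start out holding their own indices, every later index j is offered a random slot in [0, j], and lanes racing for the same slot are resolved with atomicMax (the largest j wins). A single-threaded sketch of the same selection logic (illustrative only, not part of this patch; the plain overwrite stands in for the kernel's atomicMax):

#include <cstdlib>
#include <vector>

// Decide which of a node's `degree` neighbor indices survive the sample.
std::vector<int> SampleIndices(int degree, int sample_size) {
  if (degree <= sample_size) {
    std::vector<int> idx(degree);
    for (int j = 0; j < degree; ++j) idx[j] = j;  // keep everything
    return idx;
  }
  std::vector<int> idx(sample_size);
  for (int j = 0; j < sample_size; ++j) idx[j] = j;  // initial reservoir
  for (int j = sample_size; j < degree; ++j) {
    int num = std::rand() % (j + 1);       // uniform in [0, j]
    if (num < sample_size) idx[num] = j;   // evict a random reservoir slot
  }
  return idx;
}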
+int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock<std::mutex> lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 gpu i triggers a neighbor_sample task, when this task is done, this function is called to move the sample result on other gpu back - to gup i and aggragate the result. + to gpu i and aggregate the result. the sample_result is saved on src_sample_res and the actual sample size for each node is saved on actual_sample_size. the number of actual sample_result for @@ -68,9 +168,50 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvalues does. */ +void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu( + int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size, + int* total_sample_size) { + // This function copies actual_sample_size back to the source gpu + // and computes the total sample size for each gpu. + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + total_sample_size[i] = 0; + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + + auto shard_len = h_right[i] - h_left[i] + 1; + thrust::device_vector<int> t_actual_sample_size(shard_len); + thrust::copy(actual_sample_size + h_left[i], + actual_sample_size + h_left[i] + shard_len, + t_actual_sample_size.begin()); + total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(), + t_actual_sample_size.end()); + } +} + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( - int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, - int64_t* src_sample_res, int* actual_sample_size) { + int gpu_id, int gpu_num, int* h_left, int* h_right, int64_t* src_sample_res, + thrust::host_vector<int>& total_sample_size) { + /* + if total_sample_size is [4, 5, 1, 6], + then cumsum_total_sample_size is [0, 4, 9, 10]; + */ + thrust::host_vector<int> cumsum_total_sample_size(gpu_num, 0); + thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(), + cumsum_total_sample_size.begin(), 0); for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -80,14 +221,10 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( // auto& node = path_[gpu_id][i].nodes_[cur_step]; auto& node = path_[gpu_id][i].nodes_.front(); cudaMemcpyAsync( - reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size), + reinterpret_cast<char*>(src_sample_res + cumsum_total_sample_size[i]), node.val_storage + sizeof(int64_t) * shard_len, - node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault, node.out_stream); - cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len, - sizeof(int) * shard_len, cudaMemcpyDefault, - node.out_stream); } for (int i = 0; i < gpu_num; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -102,17 +239,35 @@ void
GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, - int* d_shard_actual_sample_size, - int* d_actual_sample_size, int* idx, - int sample_size, int len) { +__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size, + int* d_actual_sample_size, + int* idx, int len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; - // d_vals[idx[i]] = d_shard_vals[i]; - for (int j = 0; j < sample_size; j++) { - d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } +} + +template +__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals, + int64_t* d_vals, + int* d_actual_sample_size, int* idx, + int* offset, int* d_offset, + int len) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); + while (i < last_idx) { + const int sample_size = d_actual_sample_size[idx[i]]; + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j]; } +#ifdef PADDLE_WITH_CUDA + __syncwarp(); +#endif + i += BLOCK_WARPS; } } @@ -226,14 +381,12 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_left = [0,5],h_right = [4,8] */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); if (len == 0) { return result; } - cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); - cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); - int* actual_sample_size = result->actual_sample_size; - int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); int dev_id = resource_->dev_id(gpu_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); @@ -258,11 +411,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); - int* d_shard_actual_sample_size_ptr = - reinterpret_cast(d_shard_actual_sample_size->ptr()); split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); @@ -302,6 +450,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } @@ -322,6 +471,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); } + for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; @@ -335,10 +485,42 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int* res_array = reinterpret_cast(node.val_storage); int* actual_size_array = res_array + shard_len; int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); - neighbor_sample_example<<remote_stream(i, gpu_id)>>>( - graph, res_array, actual_size_array, sample_array, sample_size, - shard_len); + + // 1. get actual_size_array. + // 2. 
get sum of actual_size. + // 3. get offset ptr + thrust::device_vector<int> t_res_array(shard_len); + thrust::copy(res_array, res_array + shard_len, t_res_array.begin()); + thrust::device_vector<int> t_actual_size_array(shard_len); + thrust::transform(t_res_array.begin(), t_res_array.end(), + t_actual_size_array.begin(), DegreeFunctor(graph)); + + if (sample_size >= 0) { + thrust::transform(t_actual_size_array.begin(), t_actual_size_array.end(), + t_actual_size_array.begin(), MaxFunctor(sample_size)); + } + + thrust::copy(t_actual_size_array.begin(), t_actual_size_array.end(), + actual_size_array); + + int total_sample_sum = + thrust::reduce(t_actual_size_array.begin(), t_actual_size_array.end()); + + thrust::device_vector<int> output_idx(total_sample_sum); + thrust::device_vector<int> output_offset(shard_len); + thrust::exclusive_scan(t_actual_size_array.begin(), + t_actual_size_array.end(), output_offset.begin(), 0); + + constexpr int BLOCK_WARPS = 128 / WARP_SIZE; + constexpr int TILE_SIZE = BLOCK_WARPS * 16; + const dim3 block_(WARP_SIZE, BLOCK_WARPS); + const dim3 grid_((shard_len + TILE_SIZE - 1) / TILE_SIZE); + neighbor_sample< + BLOCK_WARPS, + TILE_SIZE><<<grid_, block_, 0, resource_->remote_stream(i, gpu_id)>>>( + 0, graph, sample_size, res_array, shard_len, sample_array, + thrust::raw_pointer_cast(output_idx.data()), + thrust::raw_pointer_cast(output_offset.data())); } for (int i = 0; i < total_gpu; ++i) { @@ -349,13 +531,56 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, tables_[i]->rwlock_->UNLock(); } // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, - h_left, h_right, d_shard_vals_ptr, - d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( - d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, - d_idx_ptr, sample_size, len); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast<int*>(d_shard_actual_sample_size->ptr()); + // Store total sample number of each gpu.
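To make the compacted result layout concrete (an editorial illustration; the numbers mirror the expectations checked in test_graph.cu further down in this patch):

// actual_sample_size per key : {3, 1, 3}
// offset = exclusive_scan(..) : {0, 3, 4}
// val holds 3 + 1 + 3 = 7 values; key k's neighbors occupy
// val[offset[k] .. offset[k] + actual_sample_size[k] - 1].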
+ thrust::host_vector d_shard_total_sample_size(total_gpu, 0); + move_neighbor_sample_size_to_source_gpu( + gpu_id, total_gpu, h_left, h_right, d_shard_actual_sample_size_ptr, + thrust::raw_pointer_cast(d_shard_total_sample_size.data())); + int allocate_sample_num = 0; + for (int i = 0; i < total_gpu; ++i) { + allocate_sample_num += d_shard_total_sample_size[i]; + } + auto d_shard_vals = + memory::Alloc(place, allocate_sample_num * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, h_left, h_right, + d_shard_vals_ptr, + d_shard_total_sample_size); + + cudaMalloc((void**)&result->val, allocate_sample_num * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + cudaMalloc((void**)&result->offset, len * sizeof(int)); + int64_t* val = result->val; + int* actual_sample_size = result->actual_sample_size; + int* offset = result->offset; + + fill_dvalues_actual_sample_size<<>>( + d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, len); + thrust::device_vector t_actual_sample_size(len); + thrust::copy(actual_sample_size, actual_sample_size + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), offset, 0); + int* d_offset; + cudaMalloc(&d_offset, len * sizeof(int)); + thrust::copy(d_shard_actual_sample_size_ptr, + d_shard_actual_sample_size_ptr + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), d_offset, 0); + constexpr int BLOCK_WARPS_ = 128 / WARP_SIZE; + constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16; + const dim3 block__(WARP_SIZE, BLOCK_WARPS_); + const dim3 grid__((len + TILE_SIZE_ - 1) / TILE_SIZE_); + fill_dvalues_sample_result<<>>( + d_shard_vals_ptr, val, actual_sample_size, d_idx_ptr, offset, d_offset, + len); + cudaStreamSynchronize(stream); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; @@ -364,6 +589,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, } destroy_storage(gpu_id, i); } + cudaFree(d_offset); return result; } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f99a..f85ed330dc8ea 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 0000000000000..8c7ea10b26565 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu index 697e0ba2cdf34..06c7026eb51ca 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -94,19 +94,44 @@ TEST(TEST_FLEET, graph_comm) { 0 
--index--->0 7 --index-->2 */ + int64_t cpu_key[3] = {7, 0, 6}; void *key; cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); - res = new int64_t[9]; - cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); - int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; - for (int i = 0; i < 9; i++) { - if (expected_sample_val[i] != -1) { - ASSERT_EQ(res[i], expected_sample_val[i]); + res = new int64_t[7]; + cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost); + int *actual_sample_size = new int[3]; + cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12, + cudaMemcpyDeviceToHost); // 3, 1, 3 + int *cumsum_sample_size = new int[3]; + cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12, + cudaMemcpyDeviceToHost); // 0, 3, 4 + + std::vector> neighbors_; + std::vector neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35}; + std::vector neighbors_0 = {0}; + std::vector neighbors_6 = {21, 22, 23, 24, 25, 26, 27}; + neighbors_.push_back(neighbors_7); + neighbors_.push_back(neighbors_0); + neighbors_.push_back(neighbors_6); + for (int i = 0; i < 3; i++) { + for (int j = cumsum_sample_size[i]; + j < cumsum_sample_size[i] + actual_sample_size[i]; j++) { + bool flag = false; + for (int k = 0; k < neighbors_[i].size(); k++) { + if (res[j] == neighbors_[i][k]) { + flag = true; + break; + } + } + ASSERT_EQ(flag, true); } } + delete[] res; + delete[] actual_sample_size; + delete[] cumsum_sample_size; delete neighbor_sample_res; } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3aa6..432e57107e84d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << 
"pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 91ef59575c3aa..2babecc6ddf93 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return var_types[0] == proto::VarType::LOD_TENSOR; @@ -90,6 +95,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { bool IsForInferShape() const override { return true; } + bool IsRuntime() const override { return ctx_.IsRuntime(); } + private: const InferShapeContext& ctx_; }; @@ -123,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -142,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -155,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return 
phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -172,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -191,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -204,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -247,13 +298,11 @@ class CompatMetaTensor : public phi::MetaTensor { } void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor + set_layout(meta_tensor.layout()); + // special case: share lod of LoDTensor share_lod(meta_tensor); - share_dims(meta_tensor); } private: @@ -295,7 +344,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. 
build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); @@ -439,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
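The four vector branches above differ only in the element type pulled out of the attribute; as an editorial sketch (not part of this patch), a small helper template could collapse them, since phi::Scalar is constructible from each element type:

template <typename T>
std::vector<phi::Scalar> ToScalarList(const std::vector<T>& vec) {
  std::vector<phi::Scalar> scalar_list;
  scalar_list.reserve(vec.size());
  for (const auto& val : vec) {
    scalar_list.emplace_back(val);  // implicit T -> phi::Scalar conversion
  }
  return scalar_list;
}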
} else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); @@ -497,8 +592,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert the value of an input variable into an attribute + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Getting the value from a variable currently only supports int")); + } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a1f2d6edca6a2..7aaaef712a6e9 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -126,6 +127,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6d9..f5f6f3ecb855c 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = &block; while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587d8..10645f08dc3ba 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ?
block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d7d866fa98bb5..164a13d1560f4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode *patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -2022,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0f21906d08d0e..17c70ace301d3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation +struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -994,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} - PDNode* operator()(PDNode* x_var, PDNode* y_var); + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 9fe50deaf2d72..7cdb7a8854aad 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -25,14 +25,14 @@ std::set ignored_ops = { "sum", "clip", "clip_by_norm", - "square", "reduce_sum", "sqrt", "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw + "scale", // adamax + "assign", // adamw + "squared_l2_norm" // gradient_clip_norm }; const bool startswith(const std::string& str, const std::string& pre) { @@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { new_op.SetAttr("with_lr_sched", false); std::set set_ops{}; + // save the weight-decay tensor names and values for Lamb + std::vector weight_decay_vars{}; + std::vector weight_decay_values{}; + // TODO: consider using a map to store these?
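Aside on the TODO just above: a hedged sketch of the map alternative (hypothetical; the new_op attributes set later in this pass expect two parallel vectors, which is presumably why the patch keeps them):

#include <map>
#include <string>

// One container keyed by parameter name instead of two parallel vectors;
// iteration order becomes lexicographic rather than discovery order, which
// matters if the attribute consumers pair entries by index.
std::map<std::string, float> weight_decay_by_var;
// inside the op loop below, the equivalent of the two push_back calls:
//   weight_decay_by_var.emplace(params[0], weight_decay_value);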
for (auto* node : graph->Nodes()) { if (!node->IsOp()) { @@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { auto op_role = static_cast(op_role_); if (op_role == OpRole::kOptimize) { + // save the weight decay value from every lamb optimizer op + if (op_type == "lamb" && op->HasAttr("weight_decay")) { + auto weight_decay_value = + BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto params = op->Output("ParamOut"); + weight_decay_vars.push_back(params[0]); + weight_decay_values.push_back(weight_decay_value); + } + if (set_ops.count(op_type)) { continue; } @@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { // seems with_lr_sched is always true new_op.SetAttr("with_lr_sched", true); - // setup weight deacy + // setup weight decay for Lamb + new_op.SetAttr("weight_decay_vars", weight_decay_vars); + new_op.SetAttr("weight_decay_values", weight_decay_values); + // weight_decay/coeff is the "scale" attr of scale_op if (set_ops.count("scale") && set_ops.count("sum")) { if (set_ops.count("sign")) { diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc index e754ba72ad857..5cd8358dc083e 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc @@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - if (enable_fp16) { + auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; + if (enable_fp16 && transfer_cast_op) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "popart_cast") { if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb13..9fc6de3c8c172 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that would be eagerly deleted after the cinn_launch subgraph executes; + // we delete them in advance among the eager_deletion_ops inside the + // cinn_launch subgraph, so store them as an attribute of the subgraph + // to pass to the inner eager_deletion_ops.
for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the following eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index d33dc7f49feb0..636a594a657cb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000..4aa59d9196b1b --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
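Aside, before the implementation that follows: the pass brackets every blacklisted (fp16-incapable) op with cast ops, and the update_cast_desc helper below fills in their descs. A hedged sketch of one input-side cast desc, assuming Paddle's proto::VarType numbering where FP16 = 4 and FP32 = 5 (matching the "4 for fp16, 5 for fp32" comments in the code; `block` and the variable names are placeholders):

// Cast a non-persistable fp16 activation back to fp32 so a blacklisted
// op can consume it; the output side uses the reverse pair (5, 4).
framework::OpDesc cast_desc(block);
cast_desc.SetType("cast");
cast_desc.SetInput("X", {"x"});
cast_desc.SetOutput("Out", {"x_cast.tmp_0"});
cast_desc.SetAttr("in_dtype", 4);   // proto::VarType::FP16
cast_desc.SetAttr("out_dtype", 5);  // proto::VarType::FP32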
+ +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, 
next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 0000000000000..fc5a612ecb833 --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529..fc2758c273450 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -117,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -135,226 +136,138 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : 
fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); + patterns::Conv conv_pattern{pattern, name_scope}; + auto conv_output = conv_pattern(); - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); + conv_output->AsIntermediate(); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + int found_conv_as_x_count = 0; - (*fusion_stats)++; -} + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - 
<< "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (HasFusedActivation(residual_conv_op)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + if (HasFusedActivation(conv_op)) return; - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - (*fusion_stats)++; -} + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = 
graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); -} + int found_conv_as_y_count = 0; -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( - const std::string& name_scope, - const GraphWithStats& graph_with_stats) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - patterns::Conv conv_pattern{pattern, name_scope}; - auto conv_output = conv_pattern(); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); - conv_output->AsIntermediate(); + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); + + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const 
GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -369,44 +282,89 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if 
(IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f629..c4351b382187d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& 
graph_with_stats, - OpFuncs&&... op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d6..f4358fb243f20 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + 
VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 
412c4e40a01d5..3a286264e41ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c864..22000865948d6 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string 
elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aa..3b883dac9782a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..b7f7a8071d214 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. 
+ PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..b8b7d06a82850 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into a single + * OneDNN Elementwise operator with an activation post-op.
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map<std::string, std::string> &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814e7..17663ecf6baa3 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -28,12 +28,13 @@ USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 796aa4039c9e8..7df957b2c0eca 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,10 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,12 +26,13 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae4248b..8c51c278d4872 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. // A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc.
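A note on the recurring `USE_OP` → `USE_OP_ITSELF` switches in these testers (my reading of the convention, not spelled out in the patch): `USE_OP` pulls in an operator definition together with its fluid kernels, while `USE_OP_ITSELF` declares only the operator. Ops whose kernels have migrated to the phi library therefore switch macros and declare the remaining pieces explicitly:

```cpp
// Hedged sketch of the post-migration tester boilerplate; gelu is the op
// these testers actually touch.
USE_OP_ITSELF(gelu);                 // operator definition only, no fluid kernels
USE_OP_DEVICE_KERNEL(gelu, MKLDNN);  // keep the oneDNN device kernel
PD_DECLARE_ARG_MAPPING_FN(gelu);     // fluid-to-phi argument mapping for dispatch
```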
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece4815858640..f30d1ea1b83dd 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 62d87b6917e40..7fe1852f7396c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,14 +31,14 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_sum_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); @@ -46,9 +46,9 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); -USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(elementwise_mul_grad); +USE_OP_ITSELF(sigmoid_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f8e30c1ee294e..42fbeb5d29ce4 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if (has_phi_kernel) { + // if the op has a phi kernel but we find neither a phi GPU kernel + // nor a fluid GPU kernel, this op doesn't support GPU + return false; + } else { + // All control operators must support GPU return true; } } @@ -1456,7 +1465,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@
-1470,17 +1480,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif @@ -2083,16 +2112,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here we cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2120,22 +2158,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the
output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; @@ -2250,42 +2299,67 @@ void OperatorWithKernel::BuildPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "cannot find attribute `%s` in either the attributes or the inputs ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == + if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args.
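The int-attribute branch above now falls back to reading the value from an input variable via `experimental::MakePhiScalarFromVar`. A self-contained analogue of that lookup order (all names and the plain-int stand-ins are illustrative, not the patch's types):

```cpp
#include <stdexcept>
#include <string>
#include <unordered_map>

// attrs holds compile-time attributes; inputs stands in for values that
// arrive as tensors. The kernel wants a plain int either way.
int ResolveIntAttr(const std::string& name,
                   const std::unordered_map<std::string, int>& attrs,
                   const std::unordered_map<std::string, int>& inputs) {
  auto it = attrs.find(name);
  if (it != attrs.end()) return it->second;         // normal attribute path
  auto in_it = inputs.find(name);
  if (in_it != inputs.end()) return in_it->second;  // tensor-provided value
  throw std::runtime_error("cannot find attribute '" + name +
                           "' in either the attributes or the inputs");
}
```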
- const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba4d..6f68c261d2b24 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef22a..7a7a7b2798f59 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10ceae62dccbb..e8cd84248ea85 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_ipu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -390,6 +378,29 @@ void 
TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copying the same data from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_custom_place(src_place) && // NOLINT platform::is_custom_place( dst_place)) { /* custom_device -> custom_device*/ @@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); } - } + } // NOLINT else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && //
NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copying the same data from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -1224,8 +1246,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should be >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be6c..7d60b7d26f3fb 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled separately. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(), unsupported_ops_npu_fp16.end()); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinnedPlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc43e..d7478b18dba06 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent
backward_record_event( + "backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc3b..499cf4d8ad6d8 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5..a427b9b819911 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,28 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << 
expected_kernel_key + << ", falling back to CPU!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc547e6..9daac181d57de 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Cannot find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); @@ -280,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when calling pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -317,22 +335,32 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index ==
std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -410,6 +438,17 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal with tensor attributes here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only int is supported here")); + } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -466,6 +505,7 @@ void BuildDygraphPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); @@ -501,6 +541,10 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index fec9afbf3b403..03fa46eab5367 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -1109,8 +1109,9 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608..02a1689c23a3f 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 17cbe06748234..4cda3f32fdf3f 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu,
GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12eb..f754c6fdd0ee7 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 +28,14 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +599,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 01c9d2847e0c8..d18c8e96c49b6 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( @@ -385,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -401,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e17..f24961885c9b8 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e0..5d0c3c98d2f61 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a2..74e8ca3f229c6 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1ef..287c896e49bf2 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf8..614eea24a0e2e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, 
synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. - paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e65..f8209f051d534 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 1f83e606c3fde..bdc16ef4c7907 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor 
ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9c33d70030645..d08d28a3f6233 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -168,6 +183,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -186,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. 
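Taken together with the tests added later in this patch, the new switches are used like this (a usage sketch; `Exp_EnableUseGpuFp16` is explicitly experimental, and both ONNXRuntime calls degrade to no-ops with an error log when the corresponding build option is off):

```cpp
paddle_infer::Config config;
config.SetModel(model_dir);      // model_dir is a placeholder
config.EnableONNXRuntime();      // requires -DWITH_ONNXRUNTIME=ON at build time
config.EnableORTOptimization();  // ORT graph-level optimizations
// Experimental GPU fp16 needs IR optimization plus a GPU build:
config.SwitchIrOptim();
config.EnableUseGpu(100 /*MB pool*/, 0 /*device id*/);
config.Exp_EnableUseGpuFp16();
auto predictor = paddle_infer::CreatePredictor(config);
```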
@@ -546,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -642,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index df61b5103195d..a7caa3e369f80 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -65,6 +64,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -370,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -389,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -868,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -1185,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1235,8 +1240,7 @@ std::unique_ptr 
AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1283,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; @@ -1762,6 +1765,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU, " + "falling back to Paddle Inference."; + } else if (!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX doesn't support converting this model, falling " + "back to Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The onnxruntime backend isn't enabled; please re-compile Paddle " + "with the WITH_ONNXRUNTIME option. Falling back to Paddle " + "Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bbee..d9992f3fbef9d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 9c7e5c6b27e68..ecb5eaf982548 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -357,6 +357,37 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor,
enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -408,6 +439,27 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bc..df98a7b05cf3f 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF)
+option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
@@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
+if (WITH_ONNXRUNTIME)
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
+
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
+endif()
 
 if (WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
@@ -151,6 +159,17 @@ else()
   endif()
 endif()
 
+if (WITH_ONNXRUNTIME)
+  if(WIN32)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+  elseif(APPLE)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+  else()
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+  endif()
+endif()
+
+
 if (NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
   set(DEPS ${DEPS}
@@ -213,6 +232,14 @@ if(WIN32)
       COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
     )
   endif()
+  if(WITH_ONNXRUNTIME)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(NOT WITH_STATIC_LIB)
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
new file mode 100644
index 0000000000000..ef5c08cd041eb
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a MobileNet demo for the ONNXRuntime backend.
+ */
+
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include <vector>
+#include "gflags/gflags.h"
+#include "utils.h"  // NOLINT
+
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+
+namespace paddle {
+namespace demo {
+
+/*
+ * Run the demo model with the ONNXRuntime engine.
+ */
+void Main() {
+  paddle::AnalysisConfig config;
+  config.EnableONNXRuntime();
+  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+                  FLAGS_modeldir + "/inference.pdiparams");
+  auto predictor = paddle_infer::CreatePredictor(config);
+
+  // Inference.
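+  // Zero-copy flow: look up the tensor names, reshape the input handle,
+  // copy host data in, run the session, then copy the 1000-class output
+  // back to the host.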
+  std::vector<int> input_shape = {1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(input_shape);
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  predictor->Run();
+  output_tensor->CopyToCpu(out_data.data());
+
+  VLOG(3) << "output.size " << out_data.size();
+}
+
+}  // namespace demo
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main();
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 5f062e8063253..2c0945cd5b386 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 USE_TENSORRT=$5
 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr
-MSVC_STATIC_CRT=$7
+WITH_ONNXRUNTIME=$7
+MSVC_STATIC_CRT=$8
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
@@ -38,6 +39,30 @@ else
   use_gpu_list='false'
 fi
 
+mkdir -p $DATA_DIR
+cd $DATA_DIR
+
+if [ "$WITH_ONNXRUNTIME" == ON ]; then
+  ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB}
+  PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB}
+  # download model
+  mkdir -p MobileNetV2
+  cd MobileNetV2
+  if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
+    echo "MobileNetV2.inference.model.tar.gz has been downloaded."
+  else
+    if [ "$WIN_DETECT" != "" ]; then
+      wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    else
+      wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    fi
+    tar xzf *.tar.gz
+  fi
+  cd ..
+fi
+
 PREFIX=inference-vis-demos%2F
 URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
@@ -58,8 +83,7 @@ function download() {
   fi
   cd ..
} -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +117,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +137,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +164,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +183,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -176,7 +204,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +229,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +241,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." 
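+      # a failing demo run aborts the whole test script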
+ exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9ccd8..b2cfb060dd325 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8a7..66dec0157d98e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. 
It should be "
-          "set to the pointer of scope."));
-}
+Tensor::Tensor(void *scope) : scope_{scope} {}
 
 template <typename T>
 void *Tensor::FindTensor() const {
@@ -513,6 +537,26 @@ void *Tensor::FindTensor() const {
 }
 
 std::vector<int> Tensor::shape() const {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    std::vector<int> shape;
+    // input handle
+    if (idx_ < 0) {
+      shape.assign(shape_.begin(), shape_.end());
+    } else {  // output handle
+      auto binding = binding_.lock();
+      PADDLE_ENFORCE_NOT_NULL(binding,
+                              paddle::platform::errors::PreconditionNotMet(
+                                  "output tensor [%s] has no binding ptr",
+                                  name_));
+      std::vector<Ort::Value> outputs = binding->GetOutputValues();
+      Ort::Value &value = outputs[idx_];
+      auto info = value.GetTensorTypeAndShapeInfo();
+      auto ort_shape = info.GetShape();
+      shape.assign(ort_shape.begin(), ort_shape.end());
+    }
+    return shape;
+  }
+#endif
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_NOT_NULL(
       tensor_, paddle::platform::errors::PreconditionNotMet(
@@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) {
   device_ = device;
 }
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; }
+
+void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
+  binding_ = binding;
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, float *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<float>(memory_info, data, size, shape,
+                                         shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int64_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int32_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, uint8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape,
+                                          shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, float16 *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data),
+                                  size * sizeof(float16), shape, shape_len,
+                                  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
+}
+
+template <typename T>
+void Tensor::ORTCopyFromCpu(const T *data) {
+  auto binding = binding_.lock();
+  PADDLE_ENFORCE_NOT_NULL(binding,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "input tensor [%s] has no binding ptr", name_));
+  const char *device_name = place_ == PlaceType::kCPU ?
"Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000..bd9de252a0962 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,307 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) {
+  switch (type) {
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
+      return paddle_infer::DataType::FLOAT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
+      return paddle_infer::DataType::FLOAT16;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
+      return paddle_infer::DataType::INT8;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
+      return paddle_infer::DataType::INT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
+      return paddle_infer::DataType::INT64;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
+      return paddle_infer::DataType::UINT8;
+    default:
+      LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
+      return paddle_infer::DataType::FLOAT32;
+  }
+}
+
+bool CheckConvertToONNX(const AnalysisConfig &config) {
+  if (!config.model_dir().empty()) {
+    LOG(ERROR) << "Paddle2ONNX doesn't support the model_dir config";
+    // TODO(heliqi, jiangjiajun): Paddle2ONNX doesn't support
+    // config.model_dir() + "/__model__"
+    // config.model_dir() + var_name
+    return false;
+  } else if (config.prog_file().empty() || config.params_file().empty()) {
+    LOG(ERROR) << string::Sprintf(
+        "Invalid model path '%s', program path '%s' or params path '%s'.",
+        config.model_dir(), config.prog_file(), config.params_file());
+    return false;
+  }
+  return paddle2onnx::IsExportable(config.prog_file(), config.params_file(),
+                                   config.model_from_memory());
+}
+
+bool ONNXRuntimePredictor::Init() {
+  VLOG(3) << "ONNXRuntime Predictor::init()";
+
+  // Now ONNXRuntime only supports CPU
+  const char *device_name = config_.use_gpu() ?
"Cuda" : "Cpu"; + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); + + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + + allocator.Free(output_name); + } + return true; +} + +template <> +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor. 
+  config.SetInValid();
+  auto predictor_p = dynamic_cast<ONNXRuntimePredictor *>(predictor.get());
+
+  if (!predictor_p->Init()) {
+    return nullptr;
+  }
+
+  return predictor;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetInputNames() {
+  std::vector<std::string> input_names;
+  for (const auto &input_desc : input_desc_) {
+    input_names.push_back(input_desc.name);
+  }
+  return input_names;
+}
+
+std::map<std::string, std::vector<int64_t>>
+ONNXRuntimePredictor::GetInputTensorShape() {
+  std::map<std::string, std::vector<int64_t>> input_shapes;
+  for (const auto &input_desc : input_desc_) {
+    input_shapes[input_desc.name] = input_desc.shape;
+  }
+  return input_shapes;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
+  std::vector<std::string> output_names;
+  for (const auto &output_desc : output_desc_) {
+    output_names.push_back(output_desc.name);
+  }
+  return output_names;
+}
+
+bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
+                                        bool is_input) {
+  if (is_input) {
+    for (const auto &i : input_desc_)
+      if (i.name == name) return true;
+  } else {
+    for (const auto &i : output_desc_)
+      if (i.name == name) return true;
+  }
+  return false;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true,
+                    platform::errors::PreconditionNotMet(
+                        "The input variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true,
+                    platform::errors::PreconditionNotMet(
+                        "The output variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  int size = output_desc_.size();
+  for (int i = 0; i < size; ++i)
+    if (output_desc_[i].name == name) {
+      res->idx_ = i;
+      res->dtype_ = ConvertONNXType(output_desc_[i].dtype);
+      break;
+    }
+  return res;
+}
+
+bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                               std::vector<PaddleTensor> *output_data,
+                               int batch_size) {
+  LOG(ERROR) << "Run() is not supported by ONNXRuntimePredictor; use "
+                "ZeroCopyRun() instead.";
+  return false;
+}
+
+bool ONNXRuntimePredictor::ZeroCopyRun() {
+  try {
+    session_.Run({}, *(binding_.get()));
+  } catch (const std::exception &e) {
+    LOG(ERROR) << e.what();
+    return false;
+  }
+
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone() {
+  LOG(ERROR) << "Clone() is not supported; please create a new predictor "
+                "instead.";
+  return nullptr;
+}
+
+uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
+  return paddle::memory::Release(place_);
+}
+
+ONNXRuntimePredictor::~ONNXRuntimePredictor() {
+  binding_->ClearBoundInputs();
+  binding_->ClearBoundOutputs();
+
+  memory::Release(place_);
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h
new file mode 100644
index 0000000000000..d01756e4b96b1
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/op_compatible_info.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/string/printf.h"
+
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#include "paddle2onnx/converter.h"
+
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest.h>
+#include <gtest/gtest_prod.h>
+#endif
+
+///
+/// \file onnxruntime_predictor.h
+///
+/// \brief A predictor using ONNXRuntime
+///
+/// \author heliqi@baidu.com
+/// \date 2022-02-14
+/// \since 2.3.0
+///
+
+namespace paddle {
+
+bool CheckConvertToONNX(const AnalysisConfig &config);
+
+struct ONNXDesc {
+  std::string name;
+  std::vector<int64_t> shape;
+  ONNXTensorElementDataType dtype;
+};
+
+///
+/// \class ONNXRuntimePredictor
+///
+/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+///   auto predictor = CreatePaddlePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+///   auto input_names = predictor->GetInputNames();
+///   auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output tensors
+/// \code{cpp}
+///   auto input_t = predictor->GetInputTensor(input_names[0]);
+///   auto output_t = predictor->GetOutputTensor(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+///   predictor->ZeroCopyRun();
+/// \endcode
+///
+class ONNXRuntimePredictor : public PaddlePredictor {
+ public:
+  ///
+  /// \brief Construct a new ONNXRuntime Predictor object
+  ///
+  /// \param[in] config AnalysisConfig
+  ///
+  explicit ONNXRuntimePredictor(const AnalysisConfig &config)
+      : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") {
+    predictor_id_ = inference::GetUniqueId();
+  }
+  ///
+  /// \brief Destroy the ONNXRuntime Predictor object
+  ///
+  ~ONNXRuntimePredictor();
+
+  ///
+  /// \brief Initialize predictor
+  ///
+  /// \return Whether the init function executed successfully
+  ///
+  bool Init();
+
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
+  std::vector<std::string> GetInputNames();
+
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
+  std::vector<std::string> GetOutputNames();
+
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name output name
+  /// \return output tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const
std::string &name) override;
+  ///
+  /// \brief Get all input names and their corresponding shapes
+  ///
+  /// \return the map of input names and shapes
+  ///
+  std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
+
+  /// Not supported; use ZeroCopyRun() instead.
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
+
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
+  bool ZeroCopyRun() override;
+
+  ///
+  /// \brief Release all temporary tensors to compress the size of the memory
+  /// pool. The memory pool is considered to be composed of a list of chunks;
+  /// if a chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+  ///
+  /// \brief Clone to get a new predictor. Thread safe.
+  ///
+  /// \return a new predictor
+  ///
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+ private:
+  ///
+  /// \brief Whether an input or output with the given name exists.
+  ///
+  /// \param[in] name input or output name
+  ///
+  /// \param[in] is_input input(true) or output(false)
+  ///
+  /// \return Whether the name was found
+  ///
+  bool FindONNXDesc(const std::string &name, bool is_input);
+
+ private:
+  AnalysisConfig config_;
+
+  // ONNXRuntime
+  Ort::Env env_;
+  Ort::Session session_{nullptr};
+  std::shared_ptr<Ort::IoBinding> binding_;
+
+  platform::Place place_;
+  std::vector<ONNXDesc> input_desc_;
+  std::vector<ONNXDesc> output_desc_;
+  int predictor_id_;
+
+// Some more detailed tests are made friends of the predictor so that all
+// the internals can be tested.
+#if PADDLE_WITH_TESTING
+  FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on);
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
new file mode 100644
index 0000000000000..2be2de9c60bb1
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
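+
+// The test below exercises the full predictor round trip: create the
+// predictor, inspect input/output names and shapes, Reshape and CopyFromCpu
+// the input, ZeroCopyRun, CopyToCpu the output, then TryShrinkMemory.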
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+DEFINE_string(dirname, "", "Directory of the test model.");
+
+namespace paddle {
+
+TEST(ONNXRuntimePredictor, onnxruntime_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname + "/inference.pdmodel",
+                  FLAGS_dirname + "/inference.pdiparams");
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  config.SetCpuMathLibraryNumThreads(2);
+  LOG(INFO) << config.Summary();
+
+  auto _predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+          config);
+  ASSERT_TRUE(_predictor);
+  auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor);
+  ASSERT_TRUE(!predictor->Clone());
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // Dummy Input Data
+  std::vector<int64_t> input_shape = {-1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+
+  // testing all interfaces
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto get_input_shape = predictor->GetInputTensorShape();
+
+  ASSERT_EQ(input_names.size(), 1UL);
+  ASSERT_EQ(output_names.size(), 1UL);
+  ASSERT_EQ(input_names[0], "inputs");
+  ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
+  ASSERT_EQ(get_input_shape["inputs"], input_shape);
+
+  auto input_tensor = predictor->GetInputTensor(input_names[0]);
+  input_tensor->Reshape({1, 3, 224, 224});
+  auto output_tensor = predictor->GetOutputTensor(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+  output_tensor->CopyToCpu(out_data.data());
+
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index b4a358394404f..bdfe0e46e9ca4 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   ///
   void DisableGpu();
+  ///
+  /// \brief Enable GPU fp16 precision computation, in experimental state.
+  ///
+  /// \param op_list The operator type list.
+  ///
+  void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
+  ///
+  /// \brief A boolean state telling whether the GPU fp16 precision is turned
+  /// on.
+  ///
+  /// \return bool Whether the GPU fp16 precision is turned on.
+  ///
+  bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
 
   ///
   /// \brief Turn on XPU.
@@ -319,6 +332,18 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableNpu(int device_id = 0);
   ///
+  /// \brief Turn on ONNXRuntime.
+  ///
+  void EnableONNXRuntime();
+  ///
+  /// \brief Turn off ONNXRuntime.
+  ///
+  void DisableONNXRuntime();
+  ///
+  /// \brief Turn on ONNXRuntime Optimization.
+  ///
+  void EnableORTOptimization();
+  ///
   /// \brief A boolean state telling whether the GPU is turned on.
/// /// \return bool Whether the GPU is turned on. @@ -342,6 +367,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. + /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. + /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -834,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; @@ -841,6 +882,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4f..657dd9b600cce 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. + kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b43e..95975d8f2a892 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. 
+#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } @@ -262,6 +296,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca74..02290ed33ff1c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. + void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 5a98d109aed79..2afe2d32e2f60 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -18,6 +18,11 @@ #include "paddle_infer_declare.h" // NOLINT +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#endif + namespace paddle_infer { /// \brief Experimental. 
@@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor {
   PlaceType place_;
   int device_;
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  bool is_ort_tensor_{false};
+  std::vector<int64_t> shape_;
+  std::weak_ptr<Ort::IoBinding> binding_;
+  int idx_{-1};
+
+  void SetOrtMark(bool is_ort_tensor);
+
+  void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
+
+  template <typename T>
+  void ORTCopyFromCpu(const T* data);
+
+  template <typename T>
+  void ORTCopyToCpu(T* data) const;
+#endif
+
   friend class paddle_infer::contrib::TensorUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index e342190fda1ac..d7b07652babbd 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) {
   return config->use_gpu();
 }
 
+void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableONNXRuntime();
+}
+
+void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->DisableONNXRuntime();
+}
+
+PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->use_onnxruntime();
+}
+
+void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableORTOptimization();
+}
+
 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         int32_t l3_workspace_size, PD_Bool locked,
                         PD_Bool autotune, const char* autotune_file,
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index c314aca918f14..f6b754cad213f 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu(
 PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu(
     __pd_keep PD_Config* pd_config);
 ///
+/// \brief Turn on ONNXRuntime.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn off ONNXRuntime.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the ONNXRuntime is turned on.
+///
+/// \return Whether the ONNXRuntime is turned on.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on ONNXRuntime Optimization.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
+    __pd_keep PD_Config* pd_config);
+///
 /// \brief Turn on XPU.
 ///
 /// \param[in] pd_config config
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index def26913b0a1c..8f9f34c06b476 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) {
 	C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId))
 }
 
+///
+/// \brief Turn on ONNXRuntime.
+///
+func (config *Config) EnableONNXRuntime() {
+	C.PD_ConfigEnableONNXRuntime(config.c)
+}
+
+///
+/// \brief Turn off ONNXRuntime.
+/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU. /// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e..297841dcbcf6c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c..755558f96238d 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012..cff9fd4aa7cea 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. 
set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57c..b69292827aa13 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 67e7c78b62e9d..496e8932a690d 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e4026..7b65d2d7c97cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d1f..5a306f622adbe 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb596..1ad82df41737c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index b96992ef8514a..a856d14144469 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" USE_OP_ITSELF(conv2d); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0..cf37739608763 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379..f17e00de0eeb7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505cd2..36f13262a73d7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 861e98e443756..67d44184a76d0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace paddle { namespace inference { @@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); return cudaGetLastError() != cudaSuccess; @@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue( cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d000b..5596a89a083fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
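// NOTE: MaxPool/AvgPool and the Pool2d/Pool3dDirectCUDAFunctor helpers now
// live in phi::funcs (previously paddle::operators::math); the hunks below
// only re-point includes and namespaces, the functor behaviour is unchanged.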
#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc6f..9bfe98d759d8e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd58..a341ffd7a081c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4..f376cbd4fb302 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739..8123d37850034 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886..6b6c0cd22f03b 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index a7a417c29a7bd..f296ce96d4e5f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 61e292a922f0e..4a44448dc84cf 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -219,6 +219,12 @@ class AllocatorFacadePrivate { } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 0000000000000..b2f24d5aed1eb --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
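Review note: the capi_exp test earlier in this diff exercises the new ONNXRuntime toggles in PD_Config. A hedged usage sketch of that surface — only the four ONNXRuntime calls are taken from this diff; PD_ConfigCreate/PD_ConfigDestroy are assumed from the existing pd_config C API:

PD_Config *config = PD_ConfigCreate();
PD_ConfigEnableONNXRuntime(config);
PD_ConfigEnableORTOptimization(config);
// Reports true only when the library was built with -DWITH_ONNXRUNTIME=ON.
if (!PD_ConfigONNXRuntimeEnabled(config)) {
  // fall back to the native inference engine
}
PD_ConfigDisableONNXRuntime(config);
PD_ConfigDestroy(config);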
+
+#ifndef _WIN32
+
+#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+namespace {
+std::mutex ipc_mutex_;
+std::unordered_map<std::string, std::weak_ptr<void>> ipc_handle_to_baseptr_;
+}  // namespace
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+
+  auto iter = ipc_handle_to_baseptr_.find(handle);
+  if (iter != ipc_handle_to_baseptr_.end()) {
+    auto baseptr = iter->second.lock();
+    if (baseptr) return baseptr;
+  }
+  // An IpcMemHandle can only be opened once per process for the same handle,
+  // so the opened base pointer is cached here.
+  void *baseptr = nullptr;
+  auto ipc_handle =
+      reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
+      &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  // The handle must be closed on the same device that opened it.
+  int device_id = platform::GetCurrentDeviceId();
+  // Add a deleter that closes the ipc handle.
+  auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
+    platform::CUDADeviceGuard guard(device_id);
+    std::lock_guard<std::mutex> lock(ipc_mutex_);
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    ipc_handle_to_baseptr_.erase(handle);
+    VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
+            << "\t" << ptr;
+  });
+  std::weak_ptr<void> wp = sp;
+  ipc_handle_to_baseptr_.insert(iter, {handle, wp});
+
+  return sp;
+}
+
+CudaIpcAllocation::~CudaIpcAllocation() {
+  shared_ptr_.reset();
+  VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
+          << "\t" << this->ptr();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
new file mode 100644
index 0000000000000..52e3cf10ea73a
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
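Review note: GetIpcBasePtr above keys a weak_ptr cache by the serialized handle so each CUDA IPC handle is opened at most once per process and closed exactly when the last user releases it. The same idiom reduced to a self-contained sketch, with a generic heap allocation standing in for cudaIpcOpenMemHandle/cudaIpcCloseMemHandle:

#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

static std::mutex cache_mutex;
static std::unordered_map<std::string, std::weak_ptr<void>> cache;

std::shared_ptr<void> GetBasePtr(const std::string &key) {
  std::lock_guard<std::mutex> lock(cache_mutex);
  auto it = cache.find(key);
  if (it != cache.end()) {
    if (auto alive = it->second.lock()) return alive;  // still open: reuse it
  }
  void *resource = ::operator new(64);  // stands in for cudaIpcOpenMemHandle
  std::shared_ptr<void> sp(resource, [key](void *p) {
    std::lock_guard<std::mutex> lock(cache_mutex);
    ::operator delete(p);  // stands in for cudaIpcCloseMemHandle
    cache.erase(key);      // drop the stale cache entry
  });
  cache[key] = sp;  // cache only a non-owning reference
  return sp;
}

int main() {
  auto a = GetBasePtr("h1");
  auto b = GetBasePtr("h1");          // same underlying resource
  std::cout << (a.get() == b.get());  // prints 1
}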
+
+#ifndef _WIN32
+#pragma once
+
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle);
+
+class CudaIpcAllocation : public Allocation {
+ public:
+  explicit CudaIpcAllocation(void *ptr, size_t size, int device_id,
+                             std::shared_ptr<void> shared_ptr)
+      : Allocation(ptr, size, platform::CUDAPlace(device_id)),
+        device_id_(device_id),
+        shared_ptr_(std::move(shared_ptr)) {}
+
+  inline const int &device_id() const { return device_id_; }
+
+  ~CudaIpcAllocation() override;
+
+ private:
+  int device_id_;
+  std::shared_ptr<void> shared_ptr_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
index acaf5d548555c..25c2235cce853 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -29,6 +29,155 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string GetIPCName() {
+  static std::random_device rd;
+  std::string handle = "/paddle_";
+#ifdef _WIN32
+  handle += std::to_string(GetCurrentProcessId());
+#else
+  handle += std::to_string(getpid());
+#endif
+  handle += "_";
+  handle += std::to_string(rd());
+  return handle;
+}
+
+struct CountInfo {
+  std::atomic<int> refcount;
+};
+
+void AllocateMemoryMap(std::string filename, int flags, size_t size,
+                       void **map_ptr_, int *fd_) {
+  // TODO(@ZHUI): support win32
+  int file_flags = 0;
+  int fd = -1;
+  if (flags & MAPPED_SHAREDMEM) {
+    file_flags = O_RDWR | O_CREAT;
+  } else {
+    file_flags = O_RDONLY;
+  }
+  if (flags & MAPPED_EXCLUSIVE) {
+    file_flags |= O_EXCL;
+  }
+  if (flags & MAPPED_NOCREATE) {
+    file_flags &= ~O_CREAT;
+  }
+
+  if (!(flags & MAPPED_FROMFD)) {
+    if (flags & MAPPED_SHAREDMEM) {
+      fd = shm_open(filename.c_str(), file_flags, (mode_t)0600);
+      PADDLE_ENFORCE_NE(
+          fd, -1,
+          platform::errors::Unavailable(
+              "File descriptor %s open failed, unable to open it in "
+              "read-write mode",
+              filename.c_str()));
+      VLOG(6) << "shm_open: " << filename;
+    }
+  } else {
+    fd = -1;
+  }
+
+  PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
+                    platform::errors::Unavailable(
+                        "Truncating the file to the specified length "
+                        "failed!"));
+
+  if (flags & MAPPED_SHAREDMEM) {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  } else {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+  }
+
+  PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED,
+                    platform::errors::Unavailable(
+                        "Memory map failed when creating shared memory."));
+
+  if (flags & MAPPED_KEEPFD) {
+    *fd_ = fd;
+  } else {
+    PADDLE_ENFORCE_NE(::close(fd), -1,
+                      platform::errors::Unavailable(
+                          "Error closing memory mapped file <", filename,
+                          ">"));
+
+    *fd_ = -1;
+  }
+}
+
+std::shared_ptr<RefcountedMemoryMapAllocation>
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
+                                      size_t size) {
+  int fd = -1;
+  void *base_ptr = nullptr;
+  AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
+  void *aligned_base_ptr =
+      static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
+  return std::make_shared<RefcountedMemoryMapAllocation>(
+      aligned_base_ptr, size, filename, flags, fd);
+}
+
+RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
+    void *ptr, size_t size,
std::string ipc_name, int flags, int fd)
+    : MemoryMapAllocation(ptr, size, ipc_name, flags, fd) {
+  // must reset the base ptr first.
+  resetBaseptr();
+  initializeRefercount();
+}
+
+void MemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+}
+
+MemoryMapAllocation::~MemoryMapAllocation() { close(); }
+
+void RefcountedMemoryMapAllocation::incref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  ++info->refcount;
+}
+
+int RefcountedMemoryMapAllocation::decref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  return --info->refcount == 0;
+}
+
+void RefcountedMemoryMapAllocation::resetBaseptr() {
+  map_ptr_ =
+      static_cast<void *>(static_cast<char *>(map_ptr_) - mmap_alignment);
+  map_size_ = map_size_ + mmap_alignment;
+}
+
+void RefcountedMemoryMapAllocation::initializeRefercount() {
+  CountInfo *info = reinterpret_cast<CountInfo *>(map_ptr_);
+
+  if (flags_ & MAPPED_EXCLUSIVE) {
+    new (&info->refcount) std::atomic<int>(1);
+  } else {
+    info->refcount++;
+  }
+}
+
+void RefcountedMemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+  void *data = map_ptr_;
+  CountInfo *info = reinterpret_cast<CountInfo *>(data);
+  if (--info->refcount == 0) {
+    PADDLE_ENFORCE_NE(
+        shm_unlink(ipc_name_.c_str()), -1,
+        platform::errors::Unavailable(
+            "could not unlink the shared memory file ", ipc_name_));
+    VLOG(6) << "shm_unlink file: " << ipc_name_;
+  }
+
+  PADDLE_ENFORCE_NE(
+      munmap(map_ptr_, map_size_), -1,
+      platform::errors::Unavailable("could not unmap the shared memory file: ",
+                                    strerror(errno), " (", errno, ")"));
+}
+
 MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
   PADDLE_ENFORCE_NE(
       munmap(this->ptr(), this->size()), -1,
@@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() {
   /* Here we do not pay attention to the result of shm_unlink,
      because the memory mapped file may have been cleared due to the
      MemoryMapFdSet::Clear() */
+
+  // Code of the DataLoader subprocess:
+  //
+  //    core._array_to_share_memory_tensor(b)
+  //    out_queue.put((idx, tensor_list, structure))
+  //    core._remove_tensor_list_mmap_fds(tensor_list)
+
+  /* If the tensor is already in the send queue, it will be
+   * destructed by that function. If the tensor has not been sent yet, it
+   * will be cleared by MemoryMapFdSet::Clear().
+   * If `_remove_tensor_list_mmap_fds` has been interrupted, the
+   * tensor will be cleared by both methods.
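Review note: RefcountedMemoryMapAllocation above keeps a CountInfo atomic in the first mmap_alignment bytes of the segment and hands out the offset pointer — the creator (MAPPED_EXCLUSIVE) placement-news the counter to 1, attachers (MAPPED_NOCREATE) increment it, and the last close() unlinks the file. A self-contained POSIX sketch of the same layout (Linux, link with -lrt; error handling trimmed for brevity):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <atomic>
#include <new>

constexpr size_t kAlign = 64;  // mirrors mmap_alignment above

struct CountInfo {
  std::atomic<int> refcount;
};

// Creator side: refcount header plus payload in one shared segment.
void *CreateSegment(const char *name, size_t payload) {
  int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, 0600);
  ftruncate(fd, kAlign + payload);
  void *base = mmap(nullptr, kAlign + payload, PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, 0);
  close(fd);
  new (&static_cast<CountInfo *>(base)->refcount) std::atomic<int>(1);
  return static_cast<char *>(base) + kAlign;  // payload pointer, as above
}

// Either side: drop one reference; the last holder unlinks the name.
void Release(const char *name, void *payload_ptr, size_t payload) {
  void *base = static_cast<char *>(payload_ptr) - kAlign;
  auto *info = static_cast<CountInfo *>(base);
  if (--info->refcount == 0) shm_unlink(name);
  munmap(base, kAlign + payload);
}

int main() {
  void *p = CreateSegment("/demo_refcount", 4096);
  Release("/demo_refcount", p, 4096);
}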
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c427808..4f8dbfbb51e66 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + 
size_t size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e..e77be832c0cc8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b6869..845d0ed073b32 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,20 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1516,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ @@ -1567,23 +1557,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, 
ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1596,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ @@ -1650,22 +1613,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cdf2..f1984af6e15ea 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,21 +238,20 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,181 +263,27 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) - -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", 
"DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) 
+ (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) + +template +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; // exp(x) = e^x template @@ -497,210 +342,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - 
if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x - x.tanh(); - } -}; - -template -struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x.tanh() * x.tanh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct HardShrinkFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", 
&threshold}}; - } - template - void operator()(Device d, X x, Out out) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 || temp2).template cast(); - } -}; - -template -struct HardShrinkGradFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 || temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 -// otherwise -template -struct SoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); - } -}; - -template -struct SoftShrinkGradFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { @@ -909,42 +550,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. 
- typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,116 +773,28 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } +template +class ELUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + const float alpha = context.Attr("alpha"); + dX->mutable_data(context.GetPlace()); - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * 
static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const float alpha = context.Attr("alpha"); - dX->mutable_data(context.GetPlace()); - - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); - auto* place = - context.template device_context().eigen_device(); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); + auto* place = + context.template device_context().eigen_device(); if (alpha > 0) { ELUGradFunctor functor; @@ -1430,74 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1531,121 +980,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { 
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,73 +1001,6 @@ struct AbsGradGradFunctor : public 
BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    if (ddOut) {
-      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto x = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) =
-          ddx *
-          ((x > static_cast<T>(0)).template cast<T>() +
-           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-              .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
-
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
-      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
-                      (x <= static_cast<T>(0)).template cast<T>();
-    }
-
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) * x.exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
   float alpha;
@@ -1907,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }
 
-template <typename DeviceContext, typename Functor>
-class SigmoidDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    // set output ddout
-    ddOut = ctx.Output<framework::Tensor>("DDOut");
-    // extract dOut(input)
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-// Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-// D_OutNew, D_DOut, D_DDx               // output
-template <typename DeviceContext, typename Functor>
-class SigmoidTripleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-
-    // extract ddx(input) and out(input)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddx_var, platform::errors::NotFound(
-                     "Cannot get input Variable ddx, variable name = %s",
-                     ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var, platform::errors::NotFound(
-                     "Cannot get input Variable out, variable name = %s",
-                     ctx.InputName("Out")));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-
-    // set output ddout
-    auto ddout_var = ctx.OutputVar("DDOut");
-    if (ddout_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-
-    // extract dOut(input)
-    auto dout_var = ctx.InputVar("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dout_var, platform::errors::NotFound(
-                      "Cannot get input Variable dout_var, variable name = %s",
-                      ctx.InputName("DOut")));
-    dOut = ctx.Input<framework::Tensor>("DOut");
-
-    // set output dout_new
-    auto dout_new_var = ctx.OutputVar("DOutNew");
-    if (dout_new_var) {
-      dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    }
-
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhTripeGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2493,29 +1555,19 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_ACTIVATION_OP(__macro)                                      \
-  __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                         \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
-  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
-  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                         \
-  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
-  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
-  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
-  __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                     \
-  __macro(log2, Log2, Log2Functor, Log2GradFunctor);                         \
-  __macro(log10, Log10, Log10Functor, Log10GradFunctor);                     \
-  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                     \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
-  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
-  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
-  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
-  __macro(tanh_shrink,
TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 208abd0949aa8..7c1b288080162 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,170 +15,11 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? 
dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. 
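// An illustrative scalar reference (a sketch, not part of this patch) of the
// same piecewise softshrink rule; the helper name SoftShrinkRef is
// hypothetical. The functor below computes the identical result branch-free
// via 0/1 masks (temp1/temp2), which tends to be cheaper than divergent
// branches on GPU.
template <typename T>
inline T SoftShrinkRef(T x, T lambda) {
  if (x > lambda) return x - lambda;   // right branch: shift down by lambda
  if (x < -lambda) return x + lambda;  // left branch: shift up by lambda
  return static_cast<T>(0);            // dead zone: |x| <= lambda maps to 0
}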
- __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +65,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +292,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? 
dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -711,109 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x - tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * tanh(x) * tanh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // hardshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : x;
-  }
-};
-
-template <typename T>
-struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // hard_sigmoid(x) = 0, when x <= -3
-  //                   1, when x >= 3
-  //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T x) const {
-    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    T temp_max = temp > zero ? temp : zero;
-    T temp_min = temp_max < one ? temp_max : one;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return (out > zero && out < one) ?
dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +581,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -991,110 +633,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? 
x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -1212,6 +750,34 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; + } // namespace operators } // namespace paddle @@ -1270,40 
+836,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1319,58 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1508,9 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1521,7 +998,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ 
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1531,76 +1007,228 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - 
CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, 
KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e92..88d7cb7c1f5f4 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,52 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc10..0000000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
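// A host-side scalar sketch (illustration only, not part of this patch) of the
// element-wise predicate that the removed CPU functor above and the CUDA
// kernel below both implement; the helper name AllcloseElem is hypothetical.
// The removed kernels additionally allow a tiny 1e-15 absolute slack on top of
// this comparison, and the phi allclose kernel that replaces them is expected
// to keep the same semantics.
#include <cmath>

inline bool AllcloseElem(double a, double b, double rtol, double atol,
                         bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    // NaNs compare equal only when equal_nan is set and both sides are NaN.
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  // Core tolerance test: |a - b| <= atol + rtol * |b|.
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}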
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194ace..0000000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 0000000000000..237cfcc6f1172 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c..c5e4188ca2d6f 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9..0000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) 
\ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d..585341beea12c 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdf..fb3abd01af8c3 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d3..1a8aca777370b 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 
8b7a0b3eadb16..0000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. 
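FillIndex above seeds the key/value sort: every row of the index tensor starts as 0..num_cols-1, so the segmented sort in ArgFullSort below returns per-row argsort indices directly. A CPU sketch of that initialisation, assuming row-major layout; MakeRowIndices is an illustrative name. The GPU version strides rows by gridDim.x and columns by blockDim.x instead of plain loops.

#include <cstdint>
#include <vector>

// Each row is initialised to 0..num_cols-1; after sorting values and indices
// together row by row, row r of this buffer holds argsort(row r of the input).
std::vector<int64_t> MakeRowIndices(int64_t num_rows, int64_t num_cols) {
  std::vector<int64_t> indices(num_rows * num_cols);
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t c = 0; c < num_cols; ++c)
      indices[r * num_cols + c] = c;
  return indices;
}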
-template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ā€˜axisā€™ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
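The thrust fast path above applies only when the whole flattened input forms a single sorted segment. A host-side sketch of the same idea using the standard library; note that std::stable_sort keeps ties in original index order, a property the deleted thrust::sort_by_key call does not promise.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Flat argsort: sort indices by their values, then gather the values and,
// for descending order, reverse both outputs as the deleted kernel did.
void ArgsortFlat(const std::vector<float>& in, bool descending,
                 std::vector<float>* out, std::vector<int64_t>* ids) {
  ids->resize(in.size());
  std::iota(ids->begin(), ids->end(), int64_t{0});
  std::stable_sort(ids->begin(), ids->end(),
                   [&in](int64_t a, int64_t b) { return in[a] < in[b]; });
  out->resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) (*out)[i] = in[(*ids)[i]];
  if (descending) {
    std::reverse(out->begin(), out->end());
    std::reverse(ids->begin(), ids->end());
  }
}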
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ā€˜axisā€™ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
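Both the forward kernel above and the grad kernel below fall back to the same trick when the sort axis is not the innermost one: permute `axis` to the last position, sort contiguous rows, then apply the identical permutation to transpose the results back. A sketch of that permutation; AxisToLastPerm is an illustrative name. Because it is a single swap of axis with the last dimension, the permutation is its own inverse, which is why the deleted code reuses the same trans vector for the back-transpose.

#include <vector>

// Build the axis permutation used by the deleted kernels.
// Precondition: 0 <= axis < rank - 1 (the last-axis case never transposes).
std::vector<int> AxisToLastPerm(int rank, int axis) {
  std::vector<int> perm;
  perm.reserve(rank);
  for (int i = 0; i < axis; ++i) perm.push_back(i);
  perm.push_back(rank - 1);
  for (int i = axis + 1; i < rank - 1; ++i) perm.push_back(i);
  perm.push_back(axis);
  return perm;
}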
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf06..0000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
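The deleted grad kernels reduce argsort's backward pass to a scatter: column c of a sorted row came from input column indices[r, c], so the incoming gradient is routed back to that position, exactly as FillGrad above does with one thread per element. A CPU sketch; ArgsortGradRows is an illustrative name.

#include <cstdint>
#include <vector>

// Scatter-based argsort gradient, mirroring the deleted FillGrad indexing
// dX[r * num_cols + indices[r * num_cols + c]] = dO[r * num_cols + c].
void ArgsortGradRows(const std::vector<float>& d_out,
                     const std::vector<int64_t>& indices,
                     int64_t num_rows, int64_t num_cols,
                     std::vector<float>* d_x) {
  d_x->assign(static_cast<std::size_t>(num_rows * num_cols), 0.0f);
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t c = 0; c < num_cols; ++c)
      (*d_x)[r * num_cols + indices[r * num_cols + c]] =
          d_out[r * num_cols + c];
}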
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0..c927eec00bc8b 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c6..359b00fcf87ee 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0ef..ea6614cbfbdf8 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. 
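For reference, the CPU FullSort being deleted above boils down to sorting one (value, original column) pair vector per row; rows are independent, which is why the original wrapped the outer loop in #pragma omp parallel for under MKLML. A condensed single-row sketch:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sort one row and emit both the sorted values and the original columns.
void FullSortRow(const float* row, int64_t width, bool descending,
                 float* out_vals, int64_t* out_ids) {
  std::vector<std::pair<float, int64_t>> col_vec(width);
  for (int64_t j = 0; j < width; ++j) col_vec[j] = {row[j], j};
  std::sort(col_vec.begin(), col_vec.end(),
            [descending](const std::pair<float, int64_t>& l,
                         const std::pair<float, int64_t>& r) {
              return descending ? l.first > r.first : l.first < r.first;
            });
  for (int64_t j = 0; j < width; ++j) {
    out_vals[j] = col_vec[j].first;
    out_ids[j] = col_vec[j].second;
  }
}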
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. - return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c3..b91eb50646fec 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,14 +23,13 @@ limitations under the License. 
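assign is the clearest instance of the pattern this PR applies throughout: the hand-written InferShape is deleted and DECLARE_INFER_SHAPE_FUNCTOR binds the operator to phi::UnchangedInferMeta, so static-graph shape inference and the phi kernel share a single rule. A toy model of that layering with simplified, illustrative stand-ins (MetaTensor here is not Paddle's real class, and the real InferMeta signatures also carry attributes such as axis):

#include <cstdint>
#include <vector>

// Abstract metadata the shared rule manipulates; the framework adapts this
// to both compile-time VarDesc and runtime DenseTensor in the real code.
struct MetaTensor {
  std::vector<int64_t> dims;
};

// phi::UnchangedInferMeta-style rule: the output mirrors its input.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
}

// phi::ArgsortInferMeta-style rule: Out and Indices both share X's shape.
void ArgsortInferMeta(const MetaTensor& x, MetaTensor* out,
                      MetaTensor* indices) {
  out->dims = x.dims;
  indices->dims = x.dims;
}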
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 174207deb08b8..5194c8772e47b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. 
The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? 
ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." 
- "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 
1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = 
scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) 
{ - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b929..d274e8d2c006d 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786cc..6507890a8b5dc 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e991161d..ae03ecbcb16a0 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe..5403e2440ee58 
100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used 
to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343..0000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index 74b961d4e55e8..0000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = phi::TransposeLast2Dim(phi_dev_ctx, u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = phi::TransposeLast2Dim(phi_dev_ctx, b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = phi::TransposeLast2Dim(phi_dev_ctx, *out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - 
auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto &dev_ctx = ctx.template device_context(); - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if (dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = phi::TransposeLast2Dim(phi_dev_ctx, out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = phi::TransposeLast2Dim(phi_dev_ctx, commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
index 2afee35112e6f..0edbee534c0b5 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
@@ -22,11 +22,17 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index 460d417e61fd4..585f1caabed05 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -26,12 +26,18 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 DECLARE_double(eager_delete_tensor_gb);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
new file mode 100644
index 0000000000000..f29bc57c9a5f4
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+#include "paddle/fluid/framework/convert_utils.h"
+
+namespace paddle {
+namespace operators {
+
+template
+class CAllGatherOpMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CNCL)
+    auto x = ctx.Input("X");
+    auto out = ctx.Output("Out");
+    cnclDataType_t dtype =
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+
+    int nranks = ctx.Attr("nranks");
+    int rid = ctx.Attr("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
+    PADDLE_ENFORCE_EQ(
+        nranks, comm->nranks(),
+        platform::errors::InvalidArgument("nranks: %s should equal to %s",
+                                          nranks, comm->nranks()));
+
+    framework::DDim out_dims = x->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data(out_dims, place);
+
+    uint32_t send_numel = x->numel();
+    void* send_buff = reinterpret_cast(const_cast(x->data()));
+    void* recv_buff = reinterpret_cast(out->data());
+
+    mluStream stream = nullptr;
+    if (ctx.Attr("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel,
+                                             dtype, comm->comm(), stream));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with MLU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index c0968581acda9..7206dd01bcaa3 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
index 31b00a93f1396..0946ad8aca65e 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 7e5120cd2b392..2c4e85400ca4a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel {
     auto place = ctx.GetPlace();
 
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(in->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
     int64_t numel = in->numel();
     const void* sendbuff = in->data();
     out->Resize(in->dims());
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 9c11704704ed4..61e5f27903477 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
index d315f211709e4..d1e269fb5a4fe 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
@@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel {
     auto out = ctx.Output("Out");
     int numel = x->numel();
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
 
     int rid = ctx.Attr("ring_id");
     auto place = ctx.GetPlace();
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 5787090e6a52f..cf4d6a28744b3 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index c79b2f92b69a1..c4e410d04da5f 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index d9a7a4abb08fc..8b498787c69db 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index b8abf458c1c6d..133085ad3f3b0 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index bb78971734bf0..36c6f4fadd0fc 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index 8f7b8c4a9040b..6e02d36215697 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index c40b2c3e76a02..57e3dd53cc774 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
index 9dc287ab76a67..c39743ef9914c 100644
--- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index bc29c92b09426..8a190c1a1e091 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor {
     out_var_->GetMutable();
     if (platform::is_same_place(in_tensor.place(), place_)) {
       out_tensor->ShareDataWith(in_tensor);
+#ifdef PADDLE_WITH_IPU
+    } else if (platform::is_ipu_place(place_)) {
+      // For ipu, both in_tensor and out_tensor are allocated on cpu,
+      // PopART will copy tensor from host automatically,
+      // no TensorCopy() is required here.
+      out_tensor->ShareDataWith(in_tensor);
+#endif
     } else {
       platform::DeviceContext *context =
           platform::DeviceContextPool::Instance().Get(place_);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8213e877f7224..9be63a85fc0de 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -27,6 +27,9 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 
 namespace paddle {
 namespace operators {
@@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker,
@@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad,
 REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
 // depthwise convolution op
+DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker,
@@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad,
                   ops::Conv2DDoubleGradMaker);
 REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
+DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv3DGradMaker,
diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc
index 8897f7b229c32..fcda16a3e72ac 100644
--- a/paddle/fluid/operators/conv_op_npu.cc
+++ b/paddle/fluid/operators/conv_op_npu.cc
@@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel {
     auto stream = ctx.template device_context().stream();
 
     if (filter_grad) {
-      filter_grad->mutable_data(ctx.GetPlace());
+      filter_grad->mutable_data(ctx.GetPlace());
       std::vector filter_shape_vec = phi::vectorize(filter->dims());
 
       const auto& runner = NpuOpRunner(
diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc
index ddfc6fe862c27..e4751f1f26008 100644
--- a/paddle/fluid/operators/conv_op_xpu.cc
+++ b/paddle/fluid/operators/conv_op_xpu.cc
@@ -19,14 +19,16 @@ namespace operators {
 template
 class GemmConvXPUKernel : public framework::OpKernel {
+  using XPUT = typename XPUTypeTrait::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input("Input");
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input("Input");
     // The filter will be reshaped in the calculations,
     // so here use an assignment operation,
     // that avoids modifying the variable in the Scope.
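    // A minimal sketch of the XPUTypeTrait pattern this hunk introduces: one
    // kernel template covers float and float16 by mapping T to the device's
    // native type and reinterpret_cast-ing pointers once at the boundary.
    // DeviceTraits / HostFP16 / DeviceFP16 / launch_gemm below are
    // illustrative stand-ins, not Paddle or XPU APIs:
    //
    //   #include <cstdint>
    //   struct HostFP16 { uint16_t bits; };    // framework-side fp16 storage
    //   struct DeviceFP16 { uint16_t bits; };  // device-side fp16 storage
    //   template <typename T> struct DeviceTraits { using Type = T; };
    //   template <> struct DeviceTraits<HostFP16> { using Type = DeviceFP16; };
    //
    //   template <typename DT>
    //   void launch_gemm(const DT* in, DT* out, int n);  // device op, decl only
    //
    //   template <typename T>
    //   void LaunchConv(const T* in, T* out, int n) {
    //     using DT = typename DeviceTraits<T>::Type;
    //     // identical bit layout, so a one-time reinterpret_cast suffices
    //     launch_gemm(reinterpret_cast<const DT*>(in),
    //                 reinterpret_cast<DT*>(out), n);
    //   }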
     Tensor filter = *context.Input("Filter");
-    Tensor* output = context.Output("Output");
+    Tensor *output = context.Output("Output");
     output->mutable_data(context.GetPlace());
     int groups = context.Attr("groups");
     std::vector strides = context.Attr>("strides");
@@ -53,11 +55,16 @@ class GemmConvXPUKernel : public framework::OpKernel {
     const int img_h = static_cast(input->dims()[2]);
     const int img_w = static_cast(input->dims()[3]);
     const int f = static_cast(filter.dims()[0]);
-    auto& dev_ctx = context.template device_context();
-    int r = xpu::conv2d(
-        dev_ctx.x_context(), input->data(), filter.data(),
-        output->data(), batch_size, img_c, img_h, img_w, f, ksize,
-        strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
+
+    const XPUT *input_data = reinterpret_cast(input->data());
+    const XPUT *filter_data = reinterpret_cast(filter.data());
+    XPUT *output_data = reinterpret_cast(output->data());
+
+    auto &dev_ctx = context.template device_context();
+    int r = xpu::conv2d(
+        dev_ctx.x_context(), input_data, filter_data, output_data, batch_size,
+        img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups,
+        nullptr, nullptr, nullptr, true);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU conv kernel return wrong value[%d %s]",
@@ -67,14 +74,16 @@ class GemmConvXPUKernel : public framework::OpKernel {
 
 template
 class GemmConvGradXPUKernel : public framework::OpKernel {
+  using XPUT = typename XPUTypeTrait::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input("Input");
-    const Tensor* output_grad =
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input("Input");
+    const Tensor *output_grad =
         context.Input(framework::GradVarName("Output"));
-    Tensor* input_grad =
+    Tensor *input_grad =
         context.Output(framework::GradVarName("Input"));
-    Tensor* filter_grad =
+    Tensor *filter_grad =
         context.Output(framework::GradVarName("Filter"));
     // The filter and filter_grad will be reshaped in the calculations,
     // so here use an assignment operation,
@@ -107,19 +116,27 @@ class GemmConvGradXPUKernel : public framework::OpKernel {
     const int img_h = static_cast(input->dims()[2]);
     const int img_w = static_cast(input->dims()[3]);
     const int f = static_cast(filter.dims()[0]);
+
+    const XPUT *input_data = reinterpret_cast(input->data());
+    const XPUT *filter_data = reinterpret_cast(filter.data());
+    const XPUT *output_grad_data =
+        reinterpret_cast(output_grad->data());
+
+    XPUT *input_grad_data = nullptr;
     if (input_grad) {
       input_grad->mutable_data(context.GetPlace());
+      input_grad_data = reinterpret_cast(input_grad->data());
     }
+    XPUT *filter_grad_data = nullptr;
     if (filter_grad) {
       filter_grad->mutable_data(context.GetPlace());
+      filter_grad_data = reinterpret_cast(filter_grad->data());
     }
-    auto& dev_ctx = context.template device_context();
-    int r = xpu::conv2d_grad(
-        dev_ctx.x_context(), input->data(), filter.data(),
-        output_grad->data(), input_grad ? input_grad->data() : nullptr,
-        filter_grad ?
filter_grad->data() : nullptr, batch_size, img_c, - img_h, img_w, f, ksize, strides, paddings, dilations, groups, nullptr, - nullptr, nullptr, nullptr, nullptr, true); + auto &dev_ctx = context.template device_context(); + int r = xpu::conv2d_grad( + dev_ctx.x_context(), input_data, filter_data, output_grad_data, + input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f, + ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr, + nullptr, nullptr, true); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", @@ -130,14 +147,22 @@ class GemmConvGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - depthwise_conv2d, - ops::GemmConvXPUKernel); -REGISTER_OP_XPU_KERNEL( - conv2d, ops::GemmConvXPUKernel); + conv2d, ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); +REGISTER_OP_XPU_KERNEL( + depthwise_conv2d, + ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( depthwise_conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); #endif diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu deleted file mode 100644 index 1841b78af32dd..0000000000000 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ /dev/null @@ -1,1286 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void DataTranspose(const framework::ExecutionContext& ctx, - const Tensor* input, Tensor* output, - const std::vector& axis, int flag = 0) { - auto& dev_ctx = ctx.template device_context(); - phi::funcs::Transpose transpose; - auto in_dims = input->dims(); - std::vector input_transpose_vec; - for (size_t i = 0; i < axis.size(); ++i) { - if (flag == 0) - input_transpose_vec.push_back(in_dims[axis[i]]); - else - input_transpose_vec.push_back(in_dims[i]); - } - framework::DDim input_transpose_dims(phi::make_ddim(input_transpose_vec)); - output->mutable_data(input_transpose_dims, ctx.GetPlace()); - transpose(dev_ctx, *input, output, axis); -} - -template -class CUDNNConvTransposeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const T* filter_data = filter->data(); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } - } else { - input_transpose = *input; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = input_transpose.dims()[0]; - new_input_shape_vec[1] = input_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - input_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_input = input_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - const T* input_data = transformed_input.data(); - input_vec = phi::vectorize(transformed_input.dims()); - - 
std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - Tensor transformed_output; - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_output.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - output->mutable_data(ctx.GetPlace()); - transformed_output.ShareDataWith(*output); - transformed_output.Resize(phi::make_ddim(transformed_output_vec)); - } - T* transformed_output_data = transformed_output.data(); - - platform::DataLayout layout; - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - size_t workspace_size = 0; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t algo{}; -#else - cudnnConvolutionBwdDataAlgo_t algo{}; -#endif - // ------------------- cudnn conv algorithm --------------------- - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - auto layout_tensor = GetCudnnTensorFormat(layout); - bool deterministic = FLAGS_cudnn_deterministic; - - auto dtype = platform::CudnnDataType::type; - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_output, - filter, - &transformed_input, - strides, - padding_common, - dilations, - dtype}; - args.handle = handle; - args.idesc.set(transformed_output, iwo_groups); - args.wdesc.set(*filter, layout_tensor, iwo_groups); - args.odesc.set(transformed_input, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search = SearchAlgorithm; - workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find( - args, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search = SearchAlgorithm; - algo = search::Find( - args, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); -#endif - - // ------------------- cudnn conv transpose forward --------------------- - int input_offset = - transformed_input.numel() / transformed_input.dims()[0] / groups; - int output_offset = - transformed_output.numel() / transformed_output.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args.odesc.desc(), - input_data + input_offset * g, args.wdesc.desc(), - filter_data + filter_offset * g, args.cdesc.desc(), algo, &beta, - args.idesc.desc(), transformed_output_data + output_offset * g, - cudnn_workspace, workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args.wdesc.desc(), - filter_data + filter_offset * g, 
args.odesc.desc(), - input_data + input_offset * g, args.cdesc.desc(), algo, - cudnn_workspace, workspace_size, &beta, args.idesc.desc(), - transformed_output_data + output_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - if (!is_sys_pad && strides.size() == 2U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor output_transpose; - Tensor output_nchw; - output_nchw.ShareDataWith(*output); - output_nchw.Resize(phi::make_ddim(output_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } - } - } -}; - -template -class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - const T* filter_data = filter->data(); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - Tensor output_grad_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output_grad->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } - } else { - input_transpose = *input; - output_grad_transpose = *output_grad; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_output_grad; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_output_grad_shape_vec(data_dim + 2); - new_output_grad_shape_vec[0] = output_grad_transpose.dims()[0]; - new_output_grad_shape_vec[1] = output_grad_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_output_grad_shape_vec[i + 2] = - output_grad_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_output_grad.Resize(new_output_grad_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_output_grad = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_output_grad = output_grad_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = 
input_transpose.data(); - const T* output_grad_data = transformed_output_grad.data(); - output_vec = phi::vectorize(transformed_output_grad.dims()); - - // ------------------- cudnn descriptors --------------------- - platform::DataLayout layout; - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - auto dtype = platform::CudnnDataType::type; - - ConvArgs args1{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; -#else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; -#endif - - auto layout_tensor = GetCudnnTensorFormat(layout); - size_t workspace_size = 0; - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - bool deterministic = FLAGS_cudnn_deterministic; - T* input_grad_data = nullptr; - T* filter_grad_data = nullptr; - - if (input_grad) { - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - args1.handle = handle; - args1.idesc.set(transformed_output_grad, iwo_groups); - args1.wdesc.set(*filter, layout_tensor, iwo_groups); - args1.odesc.set(input_transpose, iwo_groups); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - data_algo = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - args2.handle = handle; - args2.idesc.set(transformed_output_grad, iwo_groups); - args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups); - args2.odesc.set(input_transpose, iwo_groups); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - filter_algo = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - // FIXME(typhoonzero): template type T may not be the same as cudnn call. 
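// Note: the grad kernel above sizes one scratch buffer by taking the max of
// each algorithm's workspace requirement, then reuses that buffer for every
// cuDNN/MIOpen call. A minimal, library-neutral sketch of the pattern
// (names here are illustrative, not Paddle or cuDNN APIs):
#include <algorithm>
#include <cstddef>
#include <vector>

size_t MaxWorkspaceBytes(const std::vector<size_t>& per_algo_bytes) {
  size_t workspace_size = 0;
  for (size_t bytes : per_algo_bytes) {
    // Mirrors: workspace_size = std::max(workspace_size,
    //                                    searchN::GetWorkspaceSize(argsN, algo));
    workspace_size = std::max(workspace_size, bytes);
  }
  return workspace_size;  // allocate once; every RunFunc call shares it
}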
- int input_offset = input->numel() / input->dims()[0] / groups; - int output_grad_offset = transformed_output_grad.numel() / - transformed_output_grad.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - if (input_grad) { - // Because beta is zero, it is unnecessary to reset input_grad. - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, - cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor input_grad_transpose; - Tensor input_grad_nchw; - input_grad_nchw.ShareDataWith(*input_grad); - input_grad_nchw.Resize(phi::make_ddim(input_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } - } - } - - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. 
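// The group loops above all step through memory with the same offset rule:
// one group's slice of one sample holds numel / batch / groups elements.
// A standalone sketch of that arithmetic, assuming NCHW layout (the filter
// offset is analogous but has no batch axis, so it is just numel / groups):
#include <cstdint>

struct Dims4 { int64_t n, c, h, w; };  // NCHW: channel axis is dims[1]

int64_t GroupOffset(const Dims4& d, int64_t groups) {
  const int64_t numel = d.n * d.c * d.h * d.w;
  return numel / d.n / groups;  // == (c / groups) * h * w
}
// Group g of a sample then starts at base_ptr + g * GroupOffset(d, groups),
// just like input_offset and output_grad_offset above.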
- // Gradient with respect to the filter - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - input_data + input_offset * g, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), - filter_grad_data + filter_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.odesc.desc(), - input_data + input_offset * g, args2.cdesc.desc(), - filter_algo, cudnn_workspace, workspace_size, &beta, - args2.wdesc.desc(), filter_grad_data + filter_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv_bp_data(W, ddI) + conv_bp_data(ddW, I) - * dW = conv_bp_filter(dO, ddI) - * dI = conv(dO, ddW) - */ -template -class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool deterministic = FLAGS_cudnn_deterministic; - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, &transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, 
&transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (dX) { - transformed_dX_channel = *dX; - } - } - std::vector output_vec = - phi::vectorize(transformed_dO_channel.dims()); - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dO(dO->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - std::vector new_output_grad_shape_vec(data_dim + 2); - - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - new_output_grad_shape_vec[0] = transformed_dO_channel.dims()[0]; - new_output_grad_shape_vec[1] = transformed_dO_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - - new_output_grad_shape_vec[i + 2] = - transformed_dO_channel.dims()[i + 2] + padding_diff[i]; - - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_dO.Resize(new_output_grad_shape); - - transformed_dO = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (dO) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_dO_channel, pad_value, - &transformed_dO); - } - - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X = transformed_X_channel; - transformed_dO = transformed_dO_channel; - if (ddX) { - transformed_ddX = transformed_ddX_channel; - } - - 
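// Both the grad and double-grad paths above rewrite asymmetric padding the
// same way: cuDNN/MIOpen accept a single padding value per spatial dim, so
// the kernel keeps min(before, after) as the "common" padding and
// materialises the remainder by explicitly zero-padding the tensor via
// PadFunction. A hypothetical standalone version of that split (not the
// Paddle helper):
#include <algorithm>
#include <cstddef>
#include <vector>

// paddings = {before_0, after_0, before_1, after_1, ...}
void SplitPadding(const std::vector<int>& paddings, std::vector<int>* common,
                  std::vector<int>* extra) {
  const size_t data_dim = paddings.size() / 2;
  common->assign(data_dim, 0);
  extra->assign(2 * data_dim, 0);
  for (size_t i = 0; i < data_dim; ++i) {
    (*common)[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
    (*extra)[2 * i] = paddings[2 * i] - (*common)[i];          // pad before
    (*extra)[2 * i + 1] = paddings[2 * i + 1] - (*common)[i];  // pad after
  }
}
// If the padding is already symmetric, extra is all zeros and the tensor is
// used as-is -- that is the is_sys_pad fast path above.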
if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_ddO_channel.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - ddO->mutable_data(ctx.GetPlace()); - transformed_ddO_channel = *ddO; - transformed_ddO_channel.Resize(phi::make_ddim(transformed_output_vec)); - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddO_channel, - W, - &transformed_ddX, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_ddO_channel, ddW, &transformed_X, strides, - padding_common, dilations, dtype}; - - ConvArgs args3{&transformed_dO, - dW, - &transformed_ddX_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dO, ddW, &transformed_dX_channel, strides, padding_common, - dilations, dtype}; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddO_channel, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_ddO_channel, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); 
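// The args1/args2 pair above implements the identity from the double-grad
// comment: ddO = convT(ddX, W) + convT(X, ddW). Transposed convolution is
// bilinear in (input, filter), so this is exactly the first-order term of
// perturbing both arguments. A toy 1-D check of that identity (illustrative
// code only, not part of the patch):
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> ConvT1D(const std::vector<double>& x,
                            const std::vector<double>& w) {
  std::vector<double> y(x.size() + w.size() - 1, 0.0);  // stride 1, no pad
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t k = 0; k < w.size(); ++k) y[i + k] += x[i] * w[k];
  return y;
}

int main() {
  std::vector<double> x{1, 2, 3}, w{1, -1}, ddx{0.5, 0, 0.5}, ddw{0.25, 0.25};
  std::vector<double> xp = x, wp = w;
  for (size_t i = 0; i < x.size(); ++i) xp[i] += ddx[i];
  for (size_t k = 0; k < w.size(); ++k) wp[k] += ddw[k];
  auto full = ConvT1D(xp, wp), base = ConvT1D(x, w);
  auto t1 = ConvT1D(ddx, w), t2 = ConvT1D(x, ddw), t3 = ConvT1D(ddx, ddw);
  for (size_t i = 0; i < full.size(); ++i)  // exact, since convT is bilinear
    assert(std::fabs(full[i] - (base[i] + t1[i] + t2[i] + t3[i])) < 1e-12);
  return 0;
}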
- args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, bwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_dO, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - - args3.odesc.set(transformed_ddX_channel, iwo_group); - - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( - args3, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search3 = SearchAlgorithm; - filter_algo = search3::Find( - args3, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX_channel.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dO, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( - args4, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search4 = SearchAlgorithm; - data_algo = search4::Find( - args4, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c, - &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c, - &o_d, &o_h, &o_w); - - int group_offset_in = - transformed_X.numel() / transformed_X.dims()[0] / groups; - int group_offset_out = - transformed_dO.numel() / transformed_dO.dims()[0] / groups; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, &beta, args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - 
platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - w + i * group_offset_filter, args1.odesc.desc(), - ddx + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, workspace_ptr, workspace_size, &beta, - args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if (ddW) { - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - Tensor conv_x_ddw(dO->type()); - conv_x_ddw.Resize(transformed_ddO_channel.dims()); - T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args2.odesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &beta, args2.idesc.desc(), - conv_x_ddw_data + i * group_offset_out, workspace_ptr, - workspace_size)); - }, - workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, &alpha, - args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.odesc.desc(), - x + i * group_offset_in, args2.cdesc.desc(), bwd_algo2, - workspace_ptr, workspace_size, &alpha, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if ((!is_sys_pad) && (!channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } - } else if ((!is_sys_pad) && (channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } - - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - - T* transformed_dy_channel = transformed_dO.data(); - if (dW && ddX) { - ddx = transformed_ddX_channel.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - ddx + i * group_offset_in, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.odesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - 
workspace_size); -#endif // PADDLE_WITH_HIP - } - } - - if (dX && ddW) { - ddw = ddW->data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 86532664985b4..fe76fc3aebbc1 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -13,13 +13,17 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "paddle/fluid/operators/conv_transpose_op.h" -#include <memory> + #include <string> #include <vector> #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" - +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,165 +33,6 @@ namespace operators { using DataLayout = framework::DataLayout; -void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ConvTranspose"); - - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - std::vector<int> output_size = - ctx->Attrs().Get<std::vector<int>>("output_size"); - std::vector<int> output_padding = - ctx->Attrs().Get<std::vector<int>>("output_padding"); - std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); - std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); - std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); - int groups = ctx->Attrs().Get<int>("groups"); - std::string padding_algorithm = - ctx->Attrs().Get<std::string>("padding_algorithm"); - const std::string data_layout_str = - ctx->Attrs().Get<std::string>("data_format"); - const DataLayout data_layout = - ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); - - PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, - platform::errors::InvalidArgument( - "Input of Op(conv_transpose) should be 4-D or " - "5-D Tensor. But received: %u-D Tensor, " - "the shape of input is [%s]", - in_dims.size(), in_dims)); - PADDLE_ENFORCE_EQ( - in_dims.size(), filter_dims.size(), - platform::errors::InvalidArgument( - "The input's dimension size and filter's dimension size of " - "Op (conv_transpose) should be equal. But received: the shape of " - "input is [%s], the dimension size of input is [%d], the shape " - "of filter is [%s], the dimension size of filter is [%d]. ", - in_dims, in_dims.size(), filter_dims, filter_dims.size())); - - int stride_size = strides.size(); - for (int i = 0; i < stride_size; ++i) { - PADDLE_ENFORCE_GT( - strides[i], 0, - platform::errors::InvalidArgument( - "The stride of Op(Conv) should be larger than 0, but received " - "stride is %d.", - strides[i])); - } - - int in_sub_stride_size = in_dims.size() - stride_size; - - PADDLE_ENFORCE_EQ( - in_dims.size() - strides.size(), 2U, - platform::errors::InvalidArgument( - "The input's dimension size minus Attr(stride)'s size must " - "be equal to 2 for Op(conv_transpose). 
But received: [%d], the " - "input's dimension size is [%d], the shape of input " - "is [%s], the Attr(stride)'s size is [%d].", - in_sub_stride_size, in_dims.size(), in_dims, strides.size())); - if (output_size.size()) - PADDLE_ENFORCE_EQ( - output_size.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_size) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - if (output_padding.size()) - PADDLE_ENFORCE_EQ( - output_padding.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - - const int64_t C = - (data_layout != DataLayout::kNHWC ? in_dims[1] - : in_dims[in_dims.size() - 1]); - PADDLE_ENFORCE_EQ( - C, filter_dims[0], - platform::errors::InvalidArgument( - "The number of input channels should be equal to filter channels " - "for Op(conv_transpose). But received: the input's channels is " - "[%d], the shape of input is [%s], the filter's channels is [%d], " - "the shape of filter is [%s]. The data_format is %s." - "The error may come from wrong data_format setting.", - C, in_dims, filter_dims[0], filter_dims, data_layout_str)); - - framework::DDim in_data_dims; - if (data_layout != DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - std::vector<int64_t> output_shape({in_dims[0]}); - if (data_layout != DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - auto infer_shape = (ctx->IsRuntime() || in_dims[i + offset] > 0) - ? (in_dims[i + offset] - 1) * strides[i] - - paddings[2 * i] - paddings[2 * i + 1] + - filter_extent - : -1; - if (output_size.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_size[i], infer_shape, - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should not be " - "less than the inferred output size. But received output_size = " - "[%s], whose dim %d is less than the inferred output size [%s]", - phi::make_ddim(output_size).to_str(), i, infer_shape)); - PADDLE_ENFORCE_LT( - output_size[i], infer_shape + strides[i], - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should be less " - "than inferred size + stride. But received output_size = [%s], " - "whose dim %d is not less than the inferred output size (%d) + " - "stride (%d) = %d", - phi::make_ddim(output_size).to_str(), i, infer_shape, - strides[i], infer_shape + strides[i])); - } - output_shape.push_back(output_size[i]); - } else if (output_padding.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_padding[i], 0, - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should not be " - "less than 0. 
But received output_padding = " - "[%s], whose dim %d is less than 0", - phi::make_ddim(output_padding).to_str(), i)); - PADDLE_ENFORCE_LT( - output_padding[i], std::max(strides[i], dilations[i]), - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should be less " - "than either stride or dilation. But received output_size = " - "[%s], " - "whose dim %d is not less than either stride (%d) or " - "dilation (%d)", - phi::make_ddim(output_size).to_str(), i, strides[i], - dilations[i])); - } - output_shape.push_back((infer_shape + output_padding[i])); - } else { - output_shape.push_back(infer_shape); - } - } - if (data_layout == DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - ctx->SetOutputDim("Output", phi::make_ddim(output_shape)); -} - framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -493,17 +338,6 @@ The input(X) size and output(Out) size may be different. )DOC"); } -void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } -} - framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker { } }; -void ConvTransposeOpDoubleGrad::InferShape( - framework::InferShapeContext* ctx) const { - auto x_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("Filter"); - auto do_dims = ctx->GetInputDim("DOutput"); - - if (ctx->HasOutput("DDOutput") && - (ctx->HasInput("DDInput") || (ctx->HasInput("DDFilter")))) { - ctx->SetOutputDim("DDOutput", do_dims); - } - if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { - ctx->SetOutputDim("DFilter", w_dims); - } - if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { - ctx->SetOutputDim("DInput", x_dims); - } -} - framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( namespace ops = paddle::operators; // conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose, Conv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose_grad, + Conv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + conv2d_transpose_grad_grad, Conv2dTranposeDoubleGradInferShapeFunctor, + PD_INFER_META(phi::Conv2dTransposeDoubleGradInferMeta)); + REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, 
ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR( - conv2d_transpose_grad, ops::ConvTransposeOpGrad, - ops::ConvTransposeDoubleGradMaker, - ops::ConvTransposeDoubleGradMaker); -REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad, + ops::ConvTransposeDoubleGradMaker, + ops::ConvTransposeDoubleGradMaker, + Conv2dTranposeGradInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad, + Conv2dTranposeDoubleGradInferShapeFunctor); // conv3d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose, Conv3dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose_grad, + Conv3dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv3dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad, + Conv3dTranposeGradInferShapeFunctor); // depthwise conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose, + DepthWiseConv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose_grad, + DepthWiseConv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + DepthWiseConv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad, + DepthWiseConv2dTranposeGradInferShapeFunctor); REGISTER_OP_VERSION(conv_transpose) .AddCheckpoint( diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu deleted file mode 100644 index 054cb4b33895b..0000000000000 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/gpu/depthwise_conv.h" - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template <typename DeviceContext, typename T> -class DepthwiseConvTransposeKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr<std::string>("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input<Tensor>("Input"); - Tensor filter = *context.Input<Tensor>("Filter"); - Tensor* output = context.Output<Tensor>("Output"); - output->mutable_data<T>(context.GetPlace()); - - int groups = context.Attr<int>("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be equal to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector<int> strides = context.Attr<std::vector<int>>("strides"); - std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); - std::vector<int> dilations = context.Attr<std::vector<int>>("dilations"); - std::string padding_algorithm = - context.Attr<std::string>("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - static_cast::TYPE&>(dev_ctx), - *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - static_cast::TYPE&>(dev_ctx), - *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - static_cast::TYPE&>(dev_ctx), - *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle -// conv2d -REGISTER_OP_CUDA_KERNEL(conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - 
ops::GemmConvTransposeGradKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// conv3d -REGISTER_OP_CUDA_KERNEL(conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// depthwise conv2d -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, - ops::DepthwiseConvTransposeKernel, - ops::DepthwiseConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, - ops::DepthwiseConvTransposeGradKernel, - ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ee0fb7ab36833..ac95dceb8280c 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -13,72 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, - const std::vector& begin_vec, - const std::vector& end_vec, - const std::vector& axes_vec) { - auto& place = - *context.template device_context().eigen_device(); - auto in_dims = input->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = in_dims[i]; - } - - std::vector out_shape_vec = phi::vectorize(in_dims); - for (size_t i = 0; i < axes_vec.size(); ++i) { - offsets[axes_vec[i]] = begin_vec[i]; - extents[axes_vec[i]] = end_vec[i] - begin_vec[i]; - out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i]; - } - - framework::DDim out_dims(phi::make_ddim(out_shape_vec)); - out->mutable_data(out_dims, context.GetPlace()); - - auto in_t = - framework::EigenTensor::From( - *input); - auto out_t = - framework::EigenTensor::From( - *out, out_dims); - - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); - out->Resize(out_dims); -} - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, int64_t begin_idx, - int64_t end_idx, int64_t axes) { - std::vector begin_vec = {begin_idx}; - std::vector end_vec = {end_idx}; - std::vector axes_vec = {axes}; - Slice(context, input, out, begin_vec, end_vec, axes_vec); -} - // Define Op classes in .h file so that other conv transpose // operator implementations can reuse the code. 
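// The Slice<DeviceContext, T, D> helper deleted above is an Eigen
// offsets/extents copy: zero offsets and full extents everywhere except the
// sliced axes. Its contract, reduced to a CPU-only 2-D sketch for reference
// (a hypothetical helper, not the Eigen implementation):
#include <cstddef>
#include <vector>

// Copy indices [begin, end) along `axis` of a contiguous row-major buffer.
std::vector<float> Slice2D(const std::vector<float>& in, int rows, int cols,
                           int axis, int begin, int end) {
  const int out_rows = (axis == 0) ? end - begin : rows;
  const int out_cols = (axis == 1) ? end - begin : cols;
  std::vector<float> out(static_cast<size_t>(out_rows) * out_cols);
  for (int r = 0; r < out_rows; ++r)
    for (int c = 0; c < out_cols; ++c) {
      const int sr = (axis == 0) ? r + begin : r;
      const int sc = (axis == 1) ? c + begin : c;
      out[static_cast<size_t>(r) * out_cols + c] =
          in[static_cast<size_t>(sr) * cols + sc];
    }
  return out;
}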
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { class ConvTransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; class ConvTransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - // The filter will be reshaped, so it should not be constant pointer - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_dims = output->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_dims[1] / groups; - for (size_t j = 0; j < 
data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output->dims(), 1, output->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0]}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], col_matrix_shape[0]}; - } - filter.Resize(filter_matrix_shape); - - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - set_zero(dev_ctx, output, static_cast(0)); - - int in_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - - int out_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(out_dims[1]) / groups - : static_cast(out_dims[out_dims.size() - 1]) / groups); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; - math::ConcatFunctor concat_functor; - - // convolution transpose: gemm + col2im or col2vol (similar to conv-backward - // on input) - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first - // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last - Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); - - std::vector output_batch_vec; - for (int g = 0; g < groups; g++) { - int64_t start = g * in_step; - int64_t end = (g + 1) * in_step; - int axes = (data_layout != framework::DataLayout::kNHWC ? 
0 : 1); - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - Tensor in_slice, out_slice; - - // col_matrix = filter_slice * input_slice - // of shape (o_c/g * k_h * k_w, h * w) - // or (o_c/g * k_d * k_h * k_w, d * h * w) - if (data_layout != framework::DataLayout::kNHWC) { - in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); - out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, true, in_slice, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } else { - Slice(context, &input_batch, &in_slice, start, - end, axes); - start = g * out_step; - end = (g + 1) * out_step; - axes = D - 2; - if (D == 4U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } else if (D == 5U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } - blas.MatMul(filter_slice, true, in_slice, true, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } - - if (data_dim == 2U) { - // col2im: col_matrix -> dy - // from (o_c/g * k_h * k_w, h * w) to (o_c/g, o_h, o_w) or (o_h, o_w, - // o_c/g) - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &out_slice, data_layout); - } else if (data_dim == 3U) { - // col2vol: col_matrix -> dy - // from (o_c/g * k_d * k_h * k_w, d * h * w) to (o_c/g, o_d, o_h, o_w) - // or (o_d, o_h, o_w, o_c/g) - col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice, - data_layout); - } - if (data_layout == framework::DataLayout::kNHWC) { - output_batch_vec.push_back(out_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, output_batch_vec, static_cast(D - 2), - &output_batch); - } - } - } -}; - -template -class GemmConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - // For filter, we do not use const pointer b/c we will do reshape, - // but we should avoid modifying its value. 
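// The GEMM kernels being deleted here realise transposed convolution as
// "gemm + col2im": a matrix product writes a columns buffer, and col2im
// scatter-adds it into the output (the reverse of im2col). The same idea in
// 1-D as a self-contained sketch (illustrative only; no padding or groups):
#include <cstddef>
#include <vector>

std::vector<float> ConvTransposeGemm1D(const std::vector<float>& x,  // in_w
                                       const std::vector<float>& w,  // k
                                       int stride) {
  const int in_w = static_cast<int>(x.size());
  const int k = static_cast<int>(w.size());
  // "GEMM": col(j, i) = w[j] * x[i] -- with channels this is filter^T * input.
  std::vector<float> col(static_cast<size_t>(k) * in_w);
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < in_w; ++i)
      col[static_cast<size_t>(j) * in_w + i] = w[j] * x[i];
  // col2im: scatter-add column entry (j, i) into out[i * stride + j].
  std::vector<float> out(static_cast<size_t>((in_w - 1) * stride + k), 0.f);
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < in_w; ++i)
      out[i * stride + j] += col[static_cast<size_t>(j) * in_w + i];
  return out;
}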
- Tensor filter = *context.Input("Filter"); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_grad_dims = output_grad->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_grad_dims[1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_grad_dims[out_grad_dims.size() - 1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output_grad->dims(), 1, output_grad->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0] / groups}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], - col_matrix_shape[0] / groups}; - } - filter.Resize(filter_matrix_shape); - - int in_step = - (data_layout != framework::DataLayout::kNHWC 
- ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - int col_step = static_cast(col_matrix_shape[0]) / groups; - - // convolution transpose grad on input: - // im2col + gemm (similar to conv-forward) - // input need to compute gradient - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (input_grad || filter_grad) { - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - Tensor filter_grad_; - phi::funcs::SetConstant set_zero; - - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - math::ConcatFunctor concat_functor; - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - } - if (filter_grad) { // filter_grad_ size (i_c, o_c/g, k_h, k_w) - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - } - - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for - // channel_first - // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for - // channel_last - Tensor output_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_shape); - - if (data_dim == 2U) { - // im2col: dy -> col matrix - // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for - // channel_first - // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for - // channel_last - im2col(dev_ctx, output_grad_batch, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col, data_layout); - } else if (data_dim == 3U) { - // vol2col: dy -> col_matrix - // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) for channel_first - // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h * - // k_w) for channel_last - vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, - &col, data_layout); - } - - if (input_grad) { - // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c) - Tensor input_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_matrix_shape); - - // gemm: dx = filter * dy - // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h - // * i_w) - // or - // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) -> (i_c, - // i_d, i_h, i_w) - // gemm: dx = dy^T * filter^T for channel_last - - std::vector input_grad_batch_vec; - for (int g = 0; g < groups; g++) { - // input_grad_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w) - // for channel_first - // input_grad_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g) - // for channel_last - // filter_slice: (i_c/g, o_c/g * k_h * k_w) - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * - // k_h * k_w, d * h * w) - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor input_grad_slice = - input_grad_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(filter_slice, false, col_matrix_slice, false, - static_cast(1.0), &input_grad_slice, - 
static_cast(0.0)); - } else { - Tensor input_grad_slice; - Slice(context, &input_grad_batch, - &input_grad_slice, g * in_step, - (g + 1) * in_step, 1); - blas.MatMul(col_matrix_slice, true, filter_slice, true, - static_cast(1.0), &input_grad_slice, - static_cast(0.0)); - DDim input_grad_slice_shape; - if (data_dim == 2U) { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_step}; - } else { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_dims[3], - in_step}; - } - input_grad_slice = - input_grad_slice.Resize(input_grad_slice_shape); - input_grad_batch_vec.push_back(input_grad_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, input_grad_batch_vec, - static_cast(D - 2), &input_grad_batch); - } - } - if (filter_grad) { - // input batch: (i_c, i_h * i_w) or (i_h, i_w * i_c) - Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - // gemm: d_filter = x * dy^T - // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h - // * k_w) - // or - // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w) - // -> (i_c, o_c * k_d * - // k_h * k_w) - // gemm: d_filter = x^T * dy^T for channel_last - - for (int g = 0; g < groups; g++) { - Tensor filter_grad_slice = - filter_grad_.Slice(g * in_step, (g + 1) * in_step); - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor in_batch_slice = - in_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(in_batch_slice, false, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } else { - Tensor in_batch_slice; - Slice(context, &in_batch, &in_batch_slice, - g * in_step, (g + 1) * in_step, 1); - blas.MatMul(in_batch_slice, true, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } - } - } - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 7d0ebf21829c2..050ede78f72cf 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/cpu/conv_util.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, - in_data_dims, stride, ksize); + phi::UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, + in_data_dims, stride, ksize); // construct NPU attr std::vector strides(4, 1); @@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc index 12e1739f2a267..b8bd3c4f00608 100644 --- a/paddle/fluid/operators/conv_transpose_op_xpu.cc +++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc @@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 */
+
 #include "paddle/fluid/operators/conv_transpose_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 // target_len == 2 || target_len == 4
 inline std::vector<int> vector_extend(const std::vector<int>& src,
                                       int target_len) {
@@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
@@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc
index bff6673429d9a..889cdac8f6882 100644
--- a/paddle/fluid/operators/cumprod_op.cc
+++ b/paddle/fluid/operators/cumprod_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
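Both the NPU and XPU kernels above now delegate padding normalization to phi::UpdatePaddingAndDilation. A self-contained sketch of what such a helper does for the "SAME" algorithm; the real phi version also handles "VALID", explicit pads, and dilation updates, so the details below are assumptions, not phi code:

// Hedged sketch: resolve per-dimension "SAME" padding so out = ceil(in / s).
#include <algorithm>
#include <vector>

void UpdateSamePadding(std::vector<int>* paddings, const std::vector<int>& in,
                       const std::vector<int>& strides,
                       const std::vector<int>& ksize) {
  paddings->assign(2 * in.size(), 0);
  for (size_t d = 0; d < in.size(); ++d) {
    int out = (in[d] + strides[d] - 1) / strides[d];  // ceil division
    int pad = std::max((out - 1) * strides[d] + ksize[d] - in[d], 0);
    (*paddings)[2 * d] = pad / 2;            // pad_before
    (*paddings)[2 * d + 1] = pad - pad / 2;  // pad_after gets the remainder
  }
}

For example, in = 5, stride = 2, ksize = 3 yields out = 3 and pads (1, 1), which matches the usual conv size formula floor((5 + 2 - 3) / 2) + 1 = 3.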
-#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -81,22 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d6832917f..0000000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - // record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 
1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - -template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + 
numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. - // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // 
namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae983..0000000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& 
context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84bdd..6e15fd090b8c4 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b37..ad10abf9c647b 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor 
filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, 
output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6ff..1176b96987ed6 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = 
phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f73c..35e389090175f 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862bb..6959b5cf81106 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. 
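The deformable-conv kernels deleted above stage data as weight_3d {groups, M, K}, col_buffer_3d {groups, K, N}, and an output view {batch_size / im2col_step, groups, M, N}, then run one MatMul per group. A standalone shape check with illustrative numbers (not taken from this PR):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t i_c = 16, o_c = 32, k_h = 3, k_w = 3;
  const int64_t groups = 4, im2col_step = 2, o_h = 8, o_w = 8;
  const int64_t M = o_c / groups;              // output channels per group
  const int64_t N = im2col_step * o_h * o_w;   // spatial positions per step
  const int64_t K = i_c * k_h * k_w / groups;  // unfolded input patch size
  // weight_3d {groups, M, K} x col_buffer_3d {groups, K, N}
  //   -> one {M, N} output slice per group, as in the per-group MatMul loop.
  assert(M * groups == o_c);
  assert(M * K * groups == o_c * i_c * k_h * k_w / groups);
  (void)N;
  return 0;
}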
#include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); - -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d09386..d8237fa3004e6 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd3722287..a1fe8a25665ec 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,12 +22,15 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. 
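// [illustrative note, not in the original source] For dims = {5, 2, 4, 4}
// the loop below computes batch_count = 5 * 2 = 10, i.e. ten 4x4 matrices;
// the trailing two dims are always treated as the matrix itself.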
- for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. 
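// [illustrative note] e.g. an input of shape {3, 4, 4} gives Out of shape
// {3} (one determinant per 4x4 slice), while a plain {4, 4} input is stored
// as a one-element tensor of shape {1} by the Resize below.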
- output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! 
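// [illustrative note] The identity implemented below is Jacobi's formula,
// per the Giles reference cited above: for invertible A,
//   d det(A) / dA = det(A) * inverse(A)^T
// so dL/dA = (dL/d det(A)) * det(A) * inverse(A)^T, computed here as
// unsqueeze(grad * det, [-1, -2]) broadcast-multiplied with the
// last-two-dims transpose of inverse(A).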
- inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 93fbff67e220b..ac8c12bcd7eba 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, PD_INFER_META(phi::DiagInferMeta)); -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); +REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 17665ad67e40e..144198367d538 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -177,12 +176,13 @@ __global__ void DropoutGradCUDAKernel( } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -220,7 +220,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -278,11 +279,13 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce388..c62d45570ba29 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc..3d9950902acfe 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,18 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d..0000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e619474..0000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623..07b3b53811625 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. 
 */
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
index 206d9a6c5e9c9..bdf08646f1d8b 100644
--- a/paddle/fluid/operators/dropout_op_test.cc
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -24,14 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 namespace f = paddle::framework;
 namespace p = paddle::platform;

-USE_OP(dropout);
+USE_OP_ITSELF(dropout);

 void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index 07b7e2cc7c09b..7d8660f238abc 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/dropout_op.h"
+
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"

 namespace paddle {
 namespace operators {

 #ifdef PADDLE_WITH_XPU
+using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class DropoutXPUKernel : public framework::OpKernel<T> {
   using XPUTyp = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h
index 5e4c83e1a45eb..6daf05a9d778d 100644
--- a/paddle/fluid/operators/eig_op.h
+++ b/paddle/fluid/operators/eig_op.h
@@ -21,13 +21,13 @@
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/slice.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/math_kernel.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a995877778e47..c28abb916b7a7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -27,7 +27,7 @@ limitations under the License.
*/ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 8e0bf78e9b7f9..54931d99292f9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,86 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { @@ -196,47 +113,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex 
b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d3d..d91315cc511aa 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23de..0d5f56fda1732 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3d1..afe1073d89a06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - 
HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a1951999546e..dad80a2c33f3a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d898..fb8bc9ac7f83c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - 
ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d42206..283ad2adde978 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28..830e09eeae481 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180a..f7b9fd1e265f5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, 
- ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468..6f4aba93d56e2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const 
override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
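The five-step reordering in the comment above is purely a peak-memory optimization: ddout = ddx * y + x * ddy needs one temporary, and the kernel borrows the dx output as that scratch buffer instead of allocating a fresh tensor (the in-place variant, where ddout reuses ddx, is only legal when ddout is no larger than ddx, i.e. no broadcast on ddx). A minimal sketch of the same ordering on plain float vectors, broadcasting ignored and with a hypothetical mul() standing in for default_elementwise_mul; step (2), dy = dout * ddx, is omitted since it is independent of the trick:

#include <vector>

// Hypothetical stand-in for default_elementwise_mul: c = a * b, elementwise.
static void mul(const std::vector<float>& a, const std::vector<float>& b,
                std::vector<float>* c) {
  c->resize(a.size());
  for (size_t i = 0; i < a.size(); ++i) (*c)[i] = a[i] * b[i];
}

// ddout = ddx * y + x * ddy computed with dx as the only scratch buffer,
// mirroring steps (1)-(5) above: dx temporarily holds x * ddy, ddout holds
// ddx * y, the two are summed into ddout, and only then is dx overwritten
// with its real value dout * ddy.
void DoubleGradNoExtraTemp(const std::vector<float>& x,
                           const std::vector<float>& y,
                           const std::vector<float>& dout,
                           const std::vector<float>& ddx,
                           const std::vector<float>& ddy,
                           std::vector<float>* dx,
                           std::vector<float>* ddout) {
  mul(x, ddy, dx);     // (1) dx used as scratch: x * ddy
  mul(ddx, y, ddout);  // (3) ddout = ddx * y
  for (size_t i = 0; i < ddout->size(); ++i)
    (*ddout)[i] += (*dx)[i];  // (4) ddout += scratch
  mul(dout, ddy, dx);  // (5) dx gets its real value: dout * ddy
}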
- ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc 
b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index fc128a88f2096..3e9263fe93acd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e162591..f9347d281043e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, 
- platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e..c68aa8d3d1b46 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy)
-                                 : handler.AcquireDstMemory();
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_out_memory},
-          {DNNL_ARG_DST, *dst_dy_memory},
-          {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dy->set_layout(framework::DataLayout::kMKLDNN);
-
-      // Reduction is needed for broadcasting scenario
-      if (dout->dims() != dy->dims()) {
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-
-        // As source we use mem object with results from binary operation
-        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
-                                       {DNNL_ARG_DST, *dy_memory_p}});
-        astream.wait();
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-
-      } else {
-        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// TODO(piotrekobi) add int8, uint8 support
-REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
-                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
-                                            dnnl::algorithm::binary_div>)
-
-REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseDivMKLDNNGradKernel<float>,
-                   ops::EltwiseDivMKLDNNGradKernel<paddle::platform::bfloat16>)
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
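Both the per-op grad kernel deleted above and the unified EltwiseMKLDNNGradKernel that replaces it (added in elementwise_mkldnn_op.h below) follow the same pattern: compute dy at dout's shape, then apply a reduction_sum over the broadcast axes whenever dout->dims() != dy->dims() (for elementwise_div the calculus is dx = dout / y and dy = -dout * out / y). The reduced target shape comes from CalculateBroadcastedDims; a minimal standalone sketch of that shape logic, using std::vector<int64_t> in place of tensor dims (function and variable names here are illustrative, not Paddle API):

#include <cstdint>
#include <cstdio>
#include <vector>

// For dout with shape src and dy with shape dst, build a shape of the same
// rank as src that keeps a dimension where the two shapes match and collapses
// it to 1 where dy was broadcast -- the shape the reduction primitive
// reduces dout down to.
std::vector<int64_t> BroadcastedDims(const std::vector<int64_t>& src,
                                     const std::vector<int64_t>& dst) {
  size_t j = 0;
  std::vector<int64_t> out(src.size(), 1);
  for (size_t i = 0; i < src.size(); ++i) {
    out[i] = (src[i] != dst[j]) ? 1 : dst[j++];
    if (j == dst.size()) break;
  }
  return out;
}

int main() {
  // dout: [16, 3, 8, 8], dy: [3] -> reduce dout to shape [1, 3, 1, 1].
  auto dims = BroadcastedDims({16, 3, 8, 8}, {3});
  for (auto d : dims) std::printf("%lld ", static_cast<long long>(d));
}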
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f267410..d1a1aa3008c8b 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,23 +15,77 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? 
ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +101,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -64,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -96,19 +150,187 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } else { // elementwise_mul & elementwise_div + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b..0ef5c5e628ce6 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abc..510373831eb6d 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
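The sub-specific grad kernel deleted below duplicated broadcast handling that now lives in the shared EltwiseMKLDNNGradKernel, where CalculateBroadcastedDims aligns dy's shape to dout's rank so that a reduction_sum over the inserted 1-axes recovers dy. A minimal standalone restatement of that dimension-matching step (plain C++; the free-function name and the example shapes are illustrative, not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Align dst dims against src dims: wherever the sizes disagree, emit 1
// (a broadcast axis to be reduced); otherwise consume the next dst dim.
std::vector<int64_t> BroadcastedDims(const std::vector<int64_t>& src_tz,
                                     const std::vector<int64_t>& dst_tz) {
  size_t j = 0;
  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
  for (size_t i = 0; i < src_tz.size(); ++i) {
    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
    if (j == dst_tz.size()) break;
  }
  return dst_tz_ex;
}

int main() {
  // dout: [2, 3, 4] broadcast against dy: [3, 4] -> reduce dout over axis 0
  for (int64_t d : BroadcastedDims({2, 3, 4}, {3, 4})) std::cout << d << ' ';
  std::cout << '\n';  // prints: 1 3 4
}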
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d61..ea009a38056f0 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac20..ce5c6b701d958 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 7890d634e9941..3cecc52a3c481 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,9 +27,15 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_div); +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871..64274d098c058 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 4780b2e7f5b28..0000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28..9361edd43bf15 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. 
But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,42 +97,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1..f09e7764eed39 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = 
phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - 
"expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a..df00ae54c1036 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749..70597be393c35 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, 
out_state->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &state, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
-    ctx.Wait();
+    T* out_state_data = out_state->mutable_data<T>(gpu_place);
+    T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
+    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
+
+    FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
+        in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
+        out_state_data, out_accum_data, out_scale_data);
  }
};
diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu
index 0eb84f18f25f0..27a235765227f 100644
--- a/paddle/fluid/operators/feed_forward_test.cu
+++ b/paddle/fluid/operators/feed_forward_test.cu
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/fused/attn_feed_forward.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 namespace framework = paddle::framework;
@@ -29,6 +30,11 @@ namespace platform = paddle::platform;
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);

+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+#endif
+
 // get paddle matmul op results as baseline
 template <typename T>
 void GetLinearOp(const std::vector<T> &x, const std::vector<T> &y,
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index 79018f2a97448..cb03add314327 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
     tensor_value.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&tensor_value, value);
     NpuOpRunner runner;
-#if (CANN_VERSION_CODE >= 503003)
+#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001)
     runner.SetType("FillD")
         .AddInput(tensor_value)
         .AddOutput(*out_var)
diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu
new file mode 100644
index 0000000000000..7870efba4e7a1
--- /dev/null
+++ b/paddle/fluid/operators/filter_by_instag_op.cu
@@ -0,0 +1,597 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
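The new file that follows builds its output offsets with warp-level inclusive prefix sums over cooperative-groups tiles. A self-contained sketch of that scan pattern (illustrative kernel and names, not part of the patch; assumes a single 32-thread warp):

#include <cstdio>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Warp-level inclusive prefix sum using the same shfl_up pattern as
// filter_copy_fuse_kernel below.
__global__ void warp_inclusive_scan(int* data) {
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
  int v = data[g.thread_rank()];
  for (int i = 1; i < 32; i *= 2) {
    int up = g.shfl_up(v, i);            // value held by lane (rank - i)
    if (g.thread_rank() >= i) v += up;   // lanes below i keep their value
  }
  data[g.thread_rank()] = v;             // data[k] = sum of inputs 0..k
}

int main() {
  int h[32], *d;
  for (int i = 0; i < 32; ++i) h[i] = 1;
  cudaMalloc(&d, sizeof(h));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
  warp_inclusive_scan<<<1, 32>>>(d);
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  printf("%d %d %d\n", h[0], h[15], h[31]);  // prints: 1 16 32
  cudaFree(d);
  return 0;
}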
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = 
ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + int t = out_lods_idx - 1; + int previous; + if (out_lods_idx == p_flag + 1) { + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + Vector x2_lods; + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_per_num); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + Vector x1_lods; + if (!is_x1_lod) { + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + 
phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // 
namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e..3abc980ceaafc 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a86..feae954e355b8 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 5ef13b38c8a86..feae954e355b8 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/flatten_grad_kernel.h"
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc
index c445a28c084f6..e60fc44e9a6ff 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cc
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cc
@@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp {
       ctx->SetOutputsDim("Outputs", output_shapes);
     }
   }
+
+  std::vector<int64_t> ComputeOutputShape(
+      framework::InferShapeContext* ctx) const {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv");
+    OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::string padding_algorithm =
+        ctx->Attrs().Get<std::string>("padding_algorithm");
+    int groups = ctx->Attrs().Get<int>("groups");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+    int dilation_size = dilations.size();
+    for (int i = 0; i < dilation_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          dilations[i], 0,
+          platform::errors::InvalidArgument(
+              "The dilation of Op(Conv) should be larger than 0, but received "
+              "dilation is %d.",
+              dilations[i]));
+    }
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+
+    // MKL-DNN Kernels are using NCHW order of dims description
+    // so we ignore data_format consideration for MKL-DNN kernel
+    const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) &&
+                              (data_format == "NHWC" || data_format == "NDHWC");
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size() == 4 || in_dims.size() == 5, true,
+        platform::errors::InvalidArgument(
+            "The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
+            "received: input's dimension is %u, input's shape is [%s].",
+            in_dims.size(), in_dims));
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), filter_dims.size(),
+        platform::errors::InvalidArgument(
+            "The input's dimension and filter's dimension of "
+            "Op(Conv) should be equal. But received: the input's shape is "
+            "[%s], the input's dimension is %d; the filter's shape is [%s], "
+            "the filter's dimension is %d.",
+            in_dims, in_dims.size(), filter_dims, filter_dims.size()));
+
+    int stride_size = strides.size();
+    for (int i = 0; i < stride_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          strides[i], 0,
+          platform::errors::InvalidArgument(
+              "The stride of Op(Conv) should be larger than 0, but received "
+              "stride is %d.",
+              strides[i]));
+    }
+
+    int in_sub_stride_size = in_dims.size() - stride_size;
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), strides.size() + 2U,
+        platform::errors::InvalidArgument(
+            "The difference of input's dimension and Attr(strides)'s "
+            "length must be equal to 2 for Op(Conv). "
+            "But received: input's dimension is %d, input's shape is [%s]; "
+            "Attr(stride)'s length is %d, Attr(stride) is [%s]; "
+            "difference of input's dimension and Attr(strides)'s length = %u.",
+            in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides),
+            in_sub_stride_size));
+
+    const auto input_channels =
+        channel_last ? in_dims[in_dims.size() - 1] : in_dims[1];
+
+    PADDLE_ENFORCE_EQ(
+        input_channels, filter_dims[1] * groups,
+        platform::errors::InvalidArgument(
+            "The number of input's channels should be equal to filter's "
+            "channels * groups for Op(Conv). But received: the input's "
+            "channels is %d, the input's shape is [%s]; the filter's "
+            "channels is %d, the filter's shape is [%s]; the groups is %d, "
+            "the data_format is %s. The error may come from wrong "
+            "data_format setting.",
+            input_channels, in_dims, filter_dims[1], filter_dims, groups,
+            data_format));
+    PADDLE_ENFORCE_EQ(
+        filter_dims[0] % groups, 0,
+        platform::errors::InvalidArgument(
+            "The number of output's channels (filter's first dimension) of "
+            "Op(Conv) should be divisible by groups. But received: "
+            "the output channels is %d, the filter's shape is [%s], "
+            "the groups is %d.",
+            filter_dims[0], filter_dims, groups));
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GT(
+          filter_dims[0], 0,
+          platform::errors::InvalidArgument(
+              "the size of filter at axis 0 should be greater than 0"));
+    }
+
+    framework::DDim in_data_dims;
+    if (channel_last) {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    }
+
+    framework::DDim filter_data_dims =
+        phi::slice_ddim(filter_dims, 2, filter_dims.size());
+
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             in_data_dims, strides, ksize);
+
+    std::vector<int64_t> output_shape({in_dims[0]});
+    if (!channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+    for (int i = 0; i < in_data_dims.size(); ++i) {
+      if ((!ctx->IsRuntime()) &&
+          (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
+        output_shape.push_back(-1);
+      } else {
+        output_shape.push_back(
+            ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
+                           paddings[2 * i], paddings[2 * i + 1], strides[i]));
+      }
+    }
+    if (channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+
+    return output_shape;
+  }
 };
 
 // TODO(qingqing): add gradient operator for conv2d_fusion
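`ComputeOutputShape` above defers the per-dimension arithmetic to `ConvOutputSize`, which follows the standard convolution rule `out = (in + pad_left + pad_right - (dilation * (k - 1) + 1)) / stride + 1`. A hedged re-derivation with two sanity checks (this mirrors the formula, not Paddle's exact helper):

```cpp
#include <cassert>
#include <cstdio>

// Standard conv output-size arithmetic (integer floor division).
static int ConvOut(int in, int k, int dilation, int pad_l, int pad_r,
                   int stride) {
  const int dkernel = dilation * (k - 1) + 1;  // effective kernel extent
  return (in + pad_l + pad_r - dkernel) / stride + 1;
}

int main() {
  // 224 input, 7x7 filter, stride 2, pad 3 -> 112 (classic ResNet stem).
  assert(ConvOut(224, 7, 1, 3, 3, 2) == 112);
  // 32 input, 3x3 filter, dilation 2, stride 1, pad 2 -> 32 (shape-preserving).
  assert(ConvOut(32, 3, 2, 2, 2, 1) == 32);
  std::printf("ok\n");
  return 0;
}
```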
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index 0202776757973..54e4cbdc16249 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -15,6 +15,7 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + 
static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0e..e473f8ff0662c 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + ctx->SetOutputDim("CacheKVOut", + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + + // [batch, num_head, seq_len, out_seq_len] + 
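The checks just added pin `CacheKV` to `[2, batch, num_head, cache_seq_len, head_size]` and grow the effective sequence length, which then flows into the QKOut shape set below. A worked shape example with made-up decoding sizes:

```cpp
#include <cstdio>

int main() {
  // x: [batch, seq_len, embed]; qkv weight: [3, num_head, head_size, embed]
  const int batch = 8, seq_len = 1, num_head = 16, head_size = 64;
  const int cache_seq_len = 127;  // CacheKV: [2, 8, 16, 127, 64]

  const int out_seq_len = cache_seq_len + seq_len;  // 128

  // CacheKVOut: [2, batch, num_head, out_seq_len, head_size]
  std::printf("CacheKVOut = [2, %d, %d, %d, %d]\n", batch, num_head,
              out_seq_len, head_size);
  // QKOut / SoftmaxOut: [batch, num_head, seq_len, out_seq_len]
  std::printf("QKOut      = [%d, %d, %d, %d]\n", batch, num_head, seq_len,
              out_seq_len);
  return 0;
}
```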
ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->HasInput("SrcMask")) { - ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->Attrs().Get("attn_dropout_is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } - ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SoftmaxOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); // [batch_size, num_heads, seq_len, head_dim] ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); // [batch_size, seq_len, number of heads*head size] @@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput("QKVW", "The qkv weight tensor."); AddInput("QKVBias", "The qkv bias tensor.").AsDispensable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable(); AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") .AsDispensable(); AddInput("OutLinearW", "The out_linear weight tensor."); @@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); + AddOutput("CacheKVOut", "The udpated cache KV."); AddOutput("Y", "Result after attention."); AddAttr("pre_layer_norm", @@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "0.0 and 0.001, But received [%s].", ln_epsilon)); }); + AddAttr( + "ring_id", + "ring id for tensor model parallel. distributed training and inference") + .SetDefault(-1); AddComment(R"DOC( Add fused attention op whose logic is as follows: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 03f51fc585798..d26577f06fe68 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -27,11 +27,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. 
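`ring_id` defaults to -1, so the `AllReduce` inserted after the out-linear GEMM above is a no-op on single-card graphs; with a ring configured it sums each rank's partial `[bsz_seq, hidden]` result. A reduced CPU sketch of that control flow (no NCCL; `AllReduceSketch` is a stand-in, not the patch's helper):

```cpp
#include <cstdio>
#include <vector>

// Stand-in for the kernel's AllReduce: identity unless a ring id is set.
// (The real helper dispatches to ncclAllReduce on the rank's stream.)
void AllReduceSketch(std::vector<float>* tensor, int ring_id,
                     const std::vector<std::vector<float>>& peers) {
  if (ring_id == -1) return;  // tensor model parallelism disabled
  for (const auto& peer : peers) {
    for (size_t i = 0; i < tensor->size(); ++i) (*tensor)[i] += peer[i];
  }
}

int main() {
  std::vector<float> partial = {1.f, 2.f};               // this rank's output
  std::vector<std::vector<float>> peers = {{3.f, 4.f}};  // other rank(s)

  AllReduceSketch(&partial, /*ring_id=*/0, peers);
  std::printf("%.0f %.0f\n", partial[0], partial[1]);  // 4 6
  return 0;
}
```

The grad path mirrors this: the backward all-reduce lands on `d_ln_out` (pre-layer-norm) or `d_x`, as the following hunks show.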
auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb8..717c1732b7b3a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81..a9b72a9cdf397 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(dropout); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381..c7e1f4a5463fe 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias @@ -192,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02..5dff5e2225f4f 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index e5ca15a39ef51..7d7d6ae81a093 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1ba..9f2b48a24b447 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, 
ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e05..0000000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == 
framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9bc..0000000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, 
dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb245224b..f996b1ede2f0f 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce380360815..b42050eabe300 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473bef..6c691aa14ae77 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4fcb..3be2606bfc939 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. 
*/ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f001..0000000000000 --- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
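gelu's hand-written `InferShape` (a `ShareDim` plus `ShareLoD` pair) is replaced above by registering `phi::UnchangedInferMeta` through `DECLARE_INFER_SHAPE_FUNCTOR`, the same migration applied to gather earlier. "Unchanged" is just shape pass-through; a toy model of it (the types here are invented for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// What "UnchangedInferMeta" amounts to for an elementwise op like gelu:
// the output tensor keeps the input's dims untouched.
struct MetaTensor {
  std::vector<int64_t> dims;
};

void UnchangedInferMetaSketch(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;  // shape (and, in the real framework, LoD) pass through
}

int main() {
  MetaTensor x{{8, 128, 1024}}, out;
  UnchangedInferMetaSketch(x, &out);
  assert(out.dims == x.dims);
  return 0;
}
```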
*/ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - 
return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + 
x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9..0000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
deleted file mode 100644
index d4fed8a868ff9..0000000000000
--- a/paddle/fluid/operators/gelu_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <algorithm>
-#include <cmath>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define GELU_CONSTANT 0.044715
-
-template <typename T>
-struct GeluFunctor {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out, bool approximate) const {
-    if (approximate) {
-      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp =
-            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
-             (casted_x + static_cast<float>(GELU_CONSTANT) * casted_x.cube()))
-                .tanh();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
-                     (x + static_cast<T>(GELU_CONSTANT) * x.cube()))
-                        .tanh();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto out_data = out.data();
-      int n = std::min(x.size(), out.size());
-
-      std::memset(out_data, 0, n * sizeof(T));
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
-                                 out_data, 1);
-      phi::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
-      for (int i = 0; i < n; i++) {
-        out_data[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
-      for (int i = 0; i < n; i++) {
-        out_data[i] *= static_cast<T>(0.5);
-      }
-#else
-      // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-#endif
-    }
-  }
-};
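The MKL branch above assembles out = 0.5 * x * (1 + erf(x / sqrt(2))) from
vector primitives. A scalar restatement of the same chain (assumes VMERF
applies erf elementwise, matching MKL's v?Erf; standalone sketch, not part of
the patch):

#include <cmath>
#include <cstdio>
#include <vector>

// Step-by-step scalar equivalent of AXPY -> VMERF -> (+1) -> VMUL -> (*0.5).
std::vector<float> gelu_erf(const std::vector<float>& x) {
  std::vector<float> out(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] += static_cast<float>(M_SQRT1_2) * x[i];  // AXPY: out = x / sqrt(2)
    out[i] = std::erf(out[i]);                       // VMERF: out = erf(out)
    out[i] += 1.0f;                                  // out += 1
    out[i] *= x[i];                                  // VMUL: out *= x
    out[i] *= 0.5f;                                  // out *= 0.5
  }
  return out;
}

int main() {
  for (float v : gelu_erf({-1.0f, 0.0f, 1.0f})) std::printf("%.6f\n", v);
  return 0;
}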
-
-template <typename T>
-struct GeluGradFunctor {
-  template <typename Device, typename X, typename dOut, typename dX>
-  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
-    if (approximate) {
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-
-        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
-        const float kBeta =
-            kAlpha * static_cast<float>(GELU_CONSTANT) * static_cast<float>(3);
-        const auto y =
-            (kAlpha *
-             ((static_cast<float>(GELU_CONSTANT) * casted_x.cube()) + casted_x))
-                .tanh();
-        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
-                        (static_cast<float>(1) + y +
-                         (casted_x - casted_x * y.square()) *
-                             (kAlpha + kBeta * casted_x.square())))
-                           .template cast<T>();
-      } else {
-        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
-        const T kBeta =
-            kAlpha * static_cast<T>(GELU_CONSTANT) * static_cast<T>(3);
-        const auto y =
-            (kAlpha * ((static_cast<T>(GELU_CONSTANT) * x.cube()) + x)).tanh();
-        dx.device(d) = static_cast<T>(0.5) * dout *
-                       (static_cast<T>(1) + y +
-                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto dx_data = dx.data();
-      auto dout_data = dout.data();
-      int n = std::min(x.size(), dx.size());
-
-      auto first = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(first, 0, n * sizeof(T));
-      auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(second, 0, n * sizeof(T));
-
-      // first = (0.5 * (1 + erf(x / sqrt(2))))
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first,
-                                 1);
-      phi::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
-      for (int i = 0; i < n; i++) {
-        first[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
-
-      // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
-      phi::funcs::CBlas<T>::VSQUARE(n, x_data, second);
-      phi::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
-      phi::funcs::CBlas<T>::VEXP(n, second, second);
-      phi::funcs::CBlas<T>::VMUL(n, x_data, second, second);
-      phi::funcs::CBlas<T>::SCAL(
-          n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
-
-      // dx = dout * (first + second);
-      phi::funcs::CBlas<T>::VADD(n, first, second, first);
-      phi::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
-
-      std::free(first);
-      std::free(second);
-#else
-      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
-      //                exp(- x^2 / 2))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-        auto first = static_cast<float>(0.5) *
-                     (static_cast<float>(1) +
-                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
-        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
-                      casted_x *
-                      (-static_cast<float>(0.5) * casted_x.square()).exp();
-        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
-      } else {
-        auto first =
-            static_cast<T>(0.5) *
-            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-
-        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                      (-static_cast<T>(0.5) * x.square()).exp();
-        dx.device(d) = dout * (first + second);
-      }
-#endif
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto approximate = context.Attr<bool>("approximate");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto approximate = context.Attr<bool>("approximate");
-    dx->mutable_data<T>(dout->place());
-
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluGradFunctor<T> functor;
-    functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
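The approximate branch's closed-form derivative
0.5 * (1 + y + (x - x*y^2) * (kAlpha + kBeta*x^2)), with y = tanh(kAlpha*(x +
0.044715*x^3)), can be checked the same way (standalone sketch, not part of the
patch):

#include <cmath>
#include <cstdio>

double gelu_tanh(double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;  // sqrt(2/pi)
  return 0.5 * x * (1.0 + std::tanh(kAlpha * (x + 0.044715 * x * x * x)));
}

// Derivative as written in GeluGradFunctor's approximate branch.
double gelu_tanh_grad(double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;
  const double kBeta = kAlpha * 0.044715 * 3.0;
  const double y = std::tanh(kAlpha * (0.044715 * x * x * x + x));
  return 0.5 * (1.0 + y + (x - x * y * y) * (kAlpha + kBeta * x * x));
}

int main() {
  const double h = 1e-6;
  for (double x : {-2.0, 0.0, 0.7, 2.5}) {
    double fd = (gelu_tanh(x + h) - gelu_tanh(x - h)) / (2.0 * h);
    std::printf("x=%5.2f analytic=%.8f fd=%.8f\n", x, gelu_tanh_grad(x), fd);
  }
  return 0;
}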
diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 18bbc7f4929c6..c5297dd9cd404 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index 00ff7ad2166dc..b132b3170756d 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -24,14 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index b8c2e9becf295..559d2448ad945 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
-
-#include "paddle/fluid/operators/gelu_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc
index b759345eda565..f7c006dbcb1a9 100644
--- a/paddle/fluid/operators/graph_send_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_recv_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv");
-
-    auto src_index_dims = ctx->GetInputDim("Src_index");
-    if (src_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(src_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Src_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            src_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          src_index_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The Src_index should be 1D, when it is not 2D, but we get %d",
-              src_index_dims.size()));
-    }
-
-    auto dst_index_dims = ctx->GetInputDim("Dst_index");
-    if (dst_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(dst_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Dst_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            dst_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          dst_index_dims.size(), 1,
-          platform::errors::InvalidArgument("The Dst_index should be 1D, "
-                                            "when it is not 2D, but we get %d",
-                                            dst_index_dims.size()));
-    }
-
-    PADDLE_ENFORCE_EQ(
-        src_index_dims[0], dst_index_dims[0],
-        platform::errors::InvalidArgument(
-            "Src_index and Dst_index should have the same shape."));
-
-    auto dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", dims);
-
-    if (ctx->Attrs().Get<std::string>("pool_type") == "MEAN") {
-      OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count",
-                     "GraphSendRecv");
-      ctx->SetOutputDim("Dst_count", {dims[0]});
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -164,10 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
+DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor,
+                            PD_INFER_META(phi::GraphSendRecvInferMeta));
 REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP,
                   ops::GraphSendRecvOpMaker,
                   ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
+                  GraphSendRecvInferShapeFunctor);
 REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
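The pattern here replaces a hand-written InferShape override with a functor
generated from a phi InferMeta function and passed to REGISTER_OPERATOR. A
schematic of the same wiring for a hypothetical unary op (op and class names
are illustrative placeholders, not from this patch; phi::UnaryInferMeta is an
existing phi meta function):

// Schematic only; mirrors the graph_send_recv registration above.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::UnaryInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  MyOpInferShapeFunctor);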
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 04aa6a3e10f6e..f6d3fd8984691 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,43 +31,6 @@ using Tensor = framework::Tensor;
 class GridSampleOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GridSampleOp should be 4-D Tensor, but "
-                          "received X dimension size(%d)",
-                          x_dims.size()));
-    PADDLE_ENFORCE_EQ(grid_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(Grid) of GridSampleOp should be 4-D Tensor, "
-                          "but received X dimension size(%d)",
-                          grid_dims.size()));
-    if (ctx->IsRuntime() || grid_dims[3] > 0) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[3], 2,
-          platform::errors::InvalidArgument(
-              "Input(Grid) dimension[3] should be 2, but received %d",
-              grid_dims[3]));
-    }
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[0], x_dims[0],
-          platform::errors::InvalidArgument(
-              "Input(X) and Input(Grid) dimension[0] should be equal, but "
-              "received X dimension[0](%d) != Grid dimension[0](%d)",
-              x_dims[0], grid_dims[0]));
-    }
-
-    ctx->SetOutputDim("Output",
-                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
-    ctx->ShareLoD("X", "Output");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
 class GridSampleOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
-                   framework::GradVarName("X"), "grid_sampler");
-    auto input_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
-      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
-    }
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -224,19 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor,
+                            PD_INFER_META(phi::GridSampleBaseInferMeta));
 REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                   ops::GridSampleGradMaker<paddle::framework::OpDesc>,
-                  ops::GridSampleGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler_grad,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::GridSampleGradMaker<paddle::imperative::OpBase>,
+                  
GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e312765..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - *grad_in = 
static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, 
ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = 
grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = 
Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270a4..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < 
static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = 
x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - // calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - 
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int 
out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - 
ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e62..4331523d26edc 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff 
--git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index b376334f1e93c..ab8c50d90b8ec 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
   CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 }
 
-template <typename T, typename AccT, int VecSize>
-__device__ __forceinline__ void ThreadReduce(const T* input, int size,
-                                             const int offset, AccT* mean,
-                                             AccT* var) {
+template <typename T, typename AccT, int VecSize, int Num>
+__device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
+                                             int size, const int offset,
+                                             AccT* out_mean, AccT* out_var) {
+  const T* x = arrs[0];
+  const T* y;
+  if (Num == 2) {
+    y = arrs[1];
+  }
   using VecT = kps::details::VectorType<T, VecSize>;
   int tid = threadIdx.x;
   if (offset > 0) {
-    input -= offset;
+    x -= offset;
+    if (Num == 2) {
+      y -= offset;
+    }
     size += offset;
     if (tid >= offset) {
-      AccT temp = input[tid];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += x[tid];
+        *out_var += x[tid] * x[tid];
+      } else if (Num == 2) {
+        *out_mean += y[tid];
+        *out_var += y[tid] * x[tid];
+      }
     }
     size -= blockDim.x;
-    input += blockDim.x;
+    x += blockDim.x;
+    if (Num == 2) {
+      y += blockDim.x;
+    }
  }
   int remain = size % (VecSize * blockDim.x);
 
-  T ins[VecSize];
-  VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
+  T ins_x[VecSize];
+  T ins_y[VecSize];
+  VecT* ins_vec_x = reinterpret_cast<VecT*>(&ins_x);
+  VecT* ins_vec_y = reinterpret_cast<VecT*>(&ins_y);
 
   // vector part
   for (; VecSize * tid < (size - remain); tid += blockDim.x) {
-    *ins_vec = reinterpret_cast<const VecT*>(input)[tid];
+    *ins_vec_x = reinterpret_cast<const VecT*>(x)[tid];
+    if (Num == 2) {
+      *ins_vec_y = reinterpret_cast<const VecT*>(y)[tid];
+    }
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
-      AccT temp = ins[i];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += ins_x[i];
+        *out_var += ins_x[i] * ins_x[i];
+      } else if (Num == 2) {
+        *out_mean += ins_y[i];
+        *out_var += ins_y[i] * ins_x[i];
+      }
    }
  }

  // scalar part
  tid = size - remain + threadIdx.x;
  for (; tid < size; tid += blockDim.x) {
-    AccT temp = input[tid];
-    *mean += temp;
-    *var += temp * temp;
+    if (Num == 1) {
+      *out_mean += x[tid];
+      *out_var += x[tid] * x[tid];
+    } else if (Num == 2) {
+      *out_mean += y[tid];
+      *out_var += y[tid] * x[tid];
+    }
  }
}

@@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
   AccT x_var = static_cast<AccT>(0);
   const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * size;
-  ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
+  phi::Array<const T*, 1> ins;
+  ins[0] = x;
+  ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
+
   x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
       x_mean, kps::AddFunctor<AccT>());
   x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
@@ -310,10 +341,12 @@ class GroupNormKernel
 };
 
 template <typename T>
-__global__ void GroupNormBackwardGetMeanAndVar(
-    const T* x, const T* scale, const T* bias, const T* d_y, int N, int C,
-    int W, int imsize, int groups, int group_size, T epsilon, T* d_mean,
-    T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) {
+__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale,
+                                               const T* bias, const T* d_y,
+                                               int N, int C, int W, int imsize,
+                                               int groups, int group_size,
+                                               T epsilon, T* d_mean, T* d_var,
+                                               T* d_scale, T* d_bias) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
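ThreadReduce now serves two reductions: with Num == 1 it accumulates sum(x)
and sum(x*x) for the mean/variance pass; with Num == 2 it accumulates sum(dy)
and sum(dy*x), i.e. the db/ds terms of the backward pass. A scalar reference
of those semantics (standalone sketch, not part of the patch):

#include <cstddef>

// CPU reference for what the two ThreadReduce instantiations accumulate.
// has_dy == false: out_mean = sum(x),  out_var = sum(x*x)   (forward pass)
// has_dy == true:  out_mean = sum(dy), out_var = sum(dy*x)  (db/ds backward)
void thread_reduce_ref(const float* x, const float* dy, size_t n, bool has_dy,
                       float* out_mean, float* out_var) {
  float m = 0.f, v = 0.f;
  for (size_t i = 0; i < n; ++i) {
    if (!has_dy) {
      m += x[i];
      v += x[i] * x[i];
    } else {
      m += dy[i];
      v += dy[i] * x[i];
    }
  }
  *out_mean = m;
  *out_var = v;
}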
DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += 
(d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - 
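GetScaleBiasGradientCUDAKernel above reduces the per-(n, c) statistics ds and db over the batch to obtain the scale and bias gradients. A plain CPU restatement of the same formulas (same ds/db/mean/var layout assumed; float and the function name are illustrative):

#include <cmath>

void ScaleBiasGrad(int N, int C, int G, float eps, const float* mean,
                   const float* var, const float* ds, const float* db,
                   float* d_scale, float* d_bias) {
  const int D = C / G;  // channels per group
  for (int c = 0; c < C; ++c) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (int n = 0; n < N; ++n) {
      const int nc = n * C + c;
      const int ng = n * G + c / D;
      // d_scale term: (ds - db * mean) * rsqrt(var + eps), summed over batch
      sum1 += (ds[nc] - db[nc] * mean[ng]) / std::sqrt(var[ng] + eps);
      sum2 += db[nc];  // d_bias term: plain sum of db
    }
    if (d_scale != nullptr) d_scale[c] = sum1;
    if (d_bias != nullptr) d_bias[c] = sum2;
  }
}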
set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = 
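For reference, the p1/p2/p3 decomposition described in the comment above can be written as one CPU function over a single (batch, group) slice. This is a sketch that assumes a non-null per-channel scale; names and the float specialization are illustrative, not the Paddle API:

#include <cmath>

void GroupNormDxSlice(int hw, int group_size, float eps, float mean,
                      float var, const float* scale, const float* x,
                      const float* dy, float ds_sum, float db_sum,
                      float* dx) {
  // ds_sum / db_sum are the scale-weighted sums of ds / db over the group.
  const float var_inv = 1.0f / std::sqrt(var + eps);
  const float s = 1.0f / static_cast<float>(group_size * hw);
  const float p2 = (db_sum * mean - ds_sum) * var_inv * var_inv * var_inv * s;
  const float p3 = -p2 * mean - db_sum * var_inv * s;
  for (int c = 0; c < group_size; ++c) {
    const float p1 = scale[c] * var_inv;  // per-channel coefficient
    for (int i = 0; i < hw; ++i) {
      dx[c * hw + i] = p1 * dy[c * hw + i] + p2 * x[c * hw + i] + p3;
    }
  }
}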
(scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 9575ab54b32bd..93f0d3d334f27 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -60,31 +64,6 @@ namespace operators { class HierarchicalSigmoidOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid"); - - auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); - if (with_prefetch) { - OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); - } - const int64_t input_dims = ctx->GetInputDim("X")[0]; - const int64_t label_dims = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(input_dims, label_dims, - platform::errors::InvalidArgument( - "The first dimension of " - "input and label is expected to be the same. 
" - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb07..0000000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/matrix_bit_code.h" -#include "paddle/fluid/platform/transform.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; -using platform::Transform; -using framework::LoDTensor; - -static std::vector PathToRows(const LoDTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} -template -class HierarchicalSigmoidOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoid"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoid"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoid"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); - size_t num_classes = static_cast(ctx.Attr("num_classes")); - // for remote prefetch - - bool is_custom = false; - if (path) { - is_custom = true; - } - int64_t code_length = - path ? path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in.dims()[0]; - LoDTensor sum; - auto& dev_ctx = ctx.template device_context(); - auto* pre_out_data = pre_out->mutable_data( - phi::make_ddim({batch_size, code_length}), ctx.GetPlace()); - auto pre_out_mat = EigenMatrix::From(*pre_out); - // Not all class(leaf) nodes' path lengths equal code_length, thus init as - // 0s can avoid out of path's loss. - phi::funcs::SetConstant zero; - zero(dev_ctx, pre_out, static_cast(0.0)); - auto& place = *ctx.template device_context().eigen_device(); - phi::funcs::RowwiseSum row_sum; - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - std::vector sum_dims({batch_size, 1UL}); - sum.mutable_data(phi::make_ddim(sum_dims), ctx.GetPlace()); - auto sum_mat = EigenMatrix::From(sum); - out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenMatrix::From(*out); - if (bias) { - bit_code->Add(*bias, pre_out); - } - bit_code->Mul(pre_out, w, in); - // clip to [-40, 40] - Transform trans; - trans(ctx.template device_context(), pre_out_data, - pre_out_data + pre_out->numel(), pre_out_data, - ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code->Sum(*pre_out, out, static_cast(-1)); - // use softrelu to calculate cross entropy - pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); - row_sum(dev_ctx, *pre_out, &sum); - // TODO(guosheng): Subtract the out of path's loss, since not all - // class(leaf) nodes' path lengths equal code_length. But it won't break the - // gradient check since both have the out of path's loss and will cancel out - // each other. 
- out_mat.device(place) = sum_mat + out_mat; - } -}; - -template -class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoidGrad"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoidGrad"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - bool is_sparse = ctx.Attr("is_sparse"); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoidGrad"); - auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", - "PreOut", "HierarchicalSigmoidGrad"); - auto& out_grad = GET_DATA_SAFELY( - ctx.Input(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "HierarchicalSigmoidGrad"); - LoDTensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, in_grad, static_cast(0.0)); - - size_t num_classes = static_cast(ctx.Attr("num_classes")); - - bool is_custom = false; - if (path) { - is_custom = true; - } - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - // softrelu derivative - - auto blas = phi::funcs::GetBlas(ctx); - - auto* pre_out_grad_data = pre_out_grad.data(); - auto* pre_out_data = pre_out.template data(); - auto n = pre_out.numel(); - blas.VEXP(n, pre_out_data, pre_out_grad_data); - blas.VINV(n, pre_out_grad_data, pre_out_grad_data); - for (int64_t i = 0; i < n; ++i) { - pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; - } - bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - auto* out_grad_data = out_grad.template data(); - - int64_t dim0 = pre_out_grad.dims()[0]; - int64_t dim1 = pre_out_grad.dims()[1]; - for (int64_t i = 0; i < dim0; ++i) { - T tmp = out_grad_data[i]; - blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); - } - // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to - // be consistent with the clipping in forward. 
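The VEXP/VINV/1-x sequence in the deleted backward kernel above relies on an identity of the softrelu values stored in PreOut: if p = log(1 + exp(z)), then 1 - exp(-p) = sigmoid(z), which is exactly dp/dz. A self-contained numeric check of that identity:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  for (double z : {-3.0, -0.5, 0.0, 0.5, 3.0}) {
    const double p = std::log1p(std::exp(z));       // softrelu(z), as in PreOut
    const double lhs = 1.0 - std::exp(-p);          // what VEXP, VINV, 1 - x compute
    const double rhs = 1.0 / (1.0 + std::exp(-z));  // sigmoid(z) = d softrelu / dz
    assert(std::fabs(lhs - rhs) < 1e-12);
  }
  std::puts("softrelu derivative identity holds");
  return 0;
}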
- auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } - if (!is_sparse) { - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } else { - PADDLE_ENFORCE_NOT_NULL(path, - platform::errors::NotFound( - "Custom tree must be set for sparse mode!")); - framework::Vector real_rows = PathToRows(*path); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); - zero(dev_ctx, w_grad_value, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } - bit_code->MulGradError(pre_out_grad, w, in_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 92cc6077defcd..c9fd75651b589 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram"); - const auto &nbins = ctx->Attrs().Get("bins"); - const auto &minval = ctx->Attrs().Get("min"); - const auto &maxval = ctx->Attrs().Get("max"); - - PADDLE_ENFORCE_GE(nbins, 1, - platform::errors::InvalidArgument( - "The bins should be greater than or equal to 1." - "But received nbins is %d", - nbins)); - PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument( - "max must be larger or equal to min." - "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad3..8324a6215bca8 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
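The histogram InferShape deleted above (its job now done by phi::HistogramInferMeta) enforces two preconditions and a fixed 1-D output shape. Restated in plain C++ (illustrative names):

#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> HistogramOutShape(int bins, int minval, int maxval) {
  if (bins < 1) throw std::invalid_argument("bins must be >= 1");
  if (maxval < minval) throw std::invalid_argument("max must be >= min");
  return {bins};  // Out is a 1-D tensor with `bins` buckets
}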
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf41313..069cc9416a620 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. 
" - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2adea5..0000000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? 
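The index arithmetic in index_select_cuda_kernel above maps each flat output element to its source element in one expression. A CPU mirror of that mapping (same formula; the function name is illustrative):

#include <cstdint>

// Example: input [2, 5, 3], dim = 1, index = {4, 0} -> output [2, 2, 3];
// then stride = 3, out_dim_size = 2, delta = 5 - 2 = 3, and output element
// 11 (n=1, d=1, k=2) reads input element 17 (n=1, d=0, k=2).
int64_t IndexSelectSrc(int64_t out_idx, int64_t stride, int64_t out_dim_size,
                       int64_t delta,  // in_dim_size - out_dim_size
                       const int64_t* index) {
  const int64_t pre_idx = out_idx / (stride * out_dim_size);
  const int64_t dim_idx = out_idx % (stride * out_dim_size) / stride;
  const int64_t src_dim_idx = index[dim_idx];
  return out_idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
}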
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add785..684829be2697c 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - 
framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caae3..a232fba7e28d6 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f5136969980b..77951ff394e74 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index d61eb46d97e98..cd297c53f89a0 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, 
w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int 
out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f1f..1c79213757fdf 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,56 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
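The new KeBilinearInterpNCHWBw above scatters each output gradient into the four neighboring input cells with bilinear weights, using atomics. The per-pixel weight computation, without the CUDA plumbing (single channel, illustrative names):

#include <algorithm>

// din: one [in_h, in_w] channel of the input gradient; (src_y, src_x) is the
// back-projected location of the output pixel whose gradient is d2val.
void ScatterBilinearGrad(float* din, int in_h, int in_w, float src_y,
                         float src_x, float d2val) {
  src_y = std::max(src_y, 0.0f);
  src_x = std::max(src_x, 0.0f);
  const int h1 = static_cast<int>(src_y);
  const int w1 = static_cast<int>(src_x);
  const int y_id = h1 < in_h - 1 ? 1 : 0;  // clamp at the bottom/right edge
  const int x_id = w1 < in_w - 1 ? 1 : 0;
  const float h1l = src_y - h1, h0l = 1.0f - h1l;  // lambda pairs
  const float w1l = src_x - w1, w0l = 1.0f - w1l;
  din[h1 * in_w + w1] += h0l * w0l * d2val;
  din[h1 * in_w + w1 + x_id] += h0l * w1l * d2val;
  din[(h1 + y_id) * in_w + w1] += h1l * w0l * d2val;
  din[(h1 + y_id) * in_w + w1 + x_id] += h1l * w1l * d2val;
}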
-#include "paddle/fluid/operators/isclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -100,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -154,12 +83,11 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); 
-REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c6957..0000000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf009..0000000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
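Both deleted functors (the CPU one in isclose_op.cc above and the CUDA one here) implement the same predicate: |a - b| <= atol + rtol * |b|, with optional NaN equality. A self-contained restatement:

#include <cmath>

bool IsClose(double a, double b, double rtol, double atol, bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  const double left = std::fabs(a - b);
  const double right = atol + rtol * std::fabs(b);
  // a == b catches equal infinities; the 1e-15 term matches the round-off
  // escape in the deleted functors.
  return a == b || left <= right || std::fabs(left - right) <= 1e-15;
}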
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec10149d..67c1942ea0b41 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,10 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -172,15 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e3d..0000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb1e..0000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) 
* grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb87..eac181489aa9d 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c723f..60390016d66e3 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,9 @@ limitations under the License. 
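// ---- Reviewer note (not part of the diff) ----------------------------
// Per-element math of the KLDivLossForward/KLDivLossBackward functors
// deleted with kldiv_loss_op.h above ("input" is a log-probability, as
// the op expects). A host-side sketch; names mirror the functors.
#include <cmath>

double KLDivForward(double target, double input) {
  return target <= 0 ? 0.0 : target * (std::log(target) - input);
}

// In the deleted kernel the second argument arrives pre-multiplied by
// target (grad_t = target * dLoss), so backward only flips its sign;
// the mean/batchmean reductions then divide by numel or batch size.
double KLDivBackward(double target, double grad) {
  return target <= 0 ? 0.0 : -grad;
}
// -----------------------------------------------------------------------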
*/ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -28,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -175,30 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e6500750..0000000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
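// ---- Reviewer note (not part of the diff) ----------------------------
// The output-shape rule that moves from the removed KronOp::InferShape
// into phi::KronInferMeta, as a standalone sketch: shapes are
// right-aligned, missing leading dims count as 1, and a -1 (unknown)
// dim propagates to the output.
#include <cstdint>
#include <vector>

std::vector<int64_t> KronOutShape(const std::vector<int64_t>& dim_x,
                                  const std::vector<int64_t>& dim_y) {
  const int rank_x = static_cast<int>(dim_x.size());
  const int rank_y = static_cast<int>(dim_y.size());
  const int rank = rank_x > rank_y ? rank_x : rank_y;
  std::vector<int64_t> dim_out;
  dim_out.reserve(rank);
  for (int i = 0; i < rank; ++i) {
    int64_t dim_xi = i < rank - rank_x ? 1 : dim_x[i - (rank - rank_x)];
    int64_t dim_yi = i < rank - rank_y ? 1 : dim_y[i - (rank - rank_y)];
    dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
  }
  return dim_out;
}
// -----------------------------------------------------------------------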
*/ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4d3..0000000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - 
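// ---- Reviewer note (not part of the diff) ----------------------------
// The index arithmetic inside the deleted KronElemFunctor, as a
// host-side sketch: each flat output index is peeled dim by dim through
// the output strides, and the per-dim position splits into an X
// coordinate (quotient by Y's extent) and a Y coordinate (remainder).
#include <cstdint>

void KronSourceIndices(int64_t idx, int ndims, const int64_t* shape_b,
                       const int64_t* stride_a, const int64_t* stride_b,
                       const int64_t* stride_out, int64_t* index_a,
                       int64_t* index_b) {
  int64_t index = idx;
  *index_a = 0;
  *index_b = 0;
  for (int i = 0; i < ndims; ++i) {
    const int64_t pos_i = index / stride_out[i];  // coordinate along dim i
    index %= stride_out[i];
    *index_a += stride_a[i] * (pos_i / shape_b[i]);  // block index in X
    *index_b += stride_b[i] * (pos_i % shape_b[i]);  // offset inside block
  }
  // The functor then writes out[idx] = a[index_a] * b[index_b].
}
// -----------------------------------------------------------------------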
thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - 
dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place = dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = framework::EigenMatrix::Reshape(dout_x, 1); - auto 
eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template -class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e..4c679d3026386 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
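// ---- Reviewer note (not part of the diff) ----------------------------
// Shape math of the UnsqueezeTo helper deleted with kron_op.h above:
// pad a rank-r shape to ndims by prepending 1s. The helper requires
// rank <= ndims and only changes the viewed dims (storage is shared).
#include <cstdint>
#include <vector>

std::vector<int64_t> UnsqueezeShape(const std::vector<int64_t>& shape,
                                    int ndims) {
  const int rank = static_cast<int>(shape.size());
  std::vector<int64_t> new_dim(ndims, 1);
  for (int i = ndims - rank; i < ndims; ++i) {
    new_dim[i] = shape[i - ndims + rank];
  }
  return new_dim;
}
// -----------------------------------------------------------------------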
*/ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index 4f30c58d37500..0000000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += 
in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int 
max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c6992..0000000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); 
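// ---- Reviewer note (not part of the diff) ----------------------------
// The per-row selection in the deleted getKthvalue, reduced to its
// core: pair each value with its column, then pick entry k-1 with
// std::nth_element. (The deleted code switches to std::partial_sort
// when k*64 < row width, and its comparator additionally orders NaNs
// last; both are omitted here for brevity.)
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

std::pair<double, int64_t> KthOfRow(
    std::vector<std::pair<double, int64_t>> row, int k /* 1-based */) {
  std::nth_element(row.begin(), row.begin() + k - 1, row.end());
  return row[k - 1];  // (value, original column index)
}
// -----------------------------------------------------------------------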
- output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - auto in_dims = 
x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 
412ae3c49b5f3..c0a4b88fc76fd 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be0c..224ab748dab6c 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d3727132..0000000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
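// ---- Reviewer note (not part of the diff) ----------------------------
// How the LayerNorm code reduces X to a 2D view before normalizing
// (the phi::flatten_to_2d call used throughout the deleted kernels):
// dims before begin_norm_axis form the batch, the rest the feature.
// A minimal host-side sketch of that product split.
#include <cstdint>
#include <vector>

void FlattenTo2D(const std::vector<int64_t>& x_dims, int begin_norm_axis,
                 int64_t* batch_size, int64_t* feature_size) {
  *batch_size = 1;
  *feature_size = 1;
  for (int i = 0; i < static_cast<int>(x_dims.size()); ++i) {
    (i < begin_norm_axis ? *batch_size : *feature_size) *= x_dims[i];
  }
}
// -----------------------------------------------------------------------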
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? 
framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? 
nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf70743..0000000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
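Review note: everything deleted above is the fluid-namespace CUDA layer_norm path. It flattens the input to a [batch_size, feature_size] matrix, reduces each row to a mean and variance, then normalizes with the optional scale and bias. A minimal standalone sketch of that row-wise pattern, assuming a power-of-two block size and plain float (illustrative names, not the deleted Paddle kernel):

```cpp
// One block per row: shared-memory tree reduction for sum and sum of
// squares, then normalize. Assumes blockDim.x is a power of two and
// shared memory of 2 * blockDim.x floats is provided at launch.
__global__ void NaiveLayerNormFwd(const float* x, const float* gamma,
                                  const float* beta, float* y, float* mean,
                                  float* var, int cols, float eps) {
  extern __shared__ float buf[];
  float* s_sum = buf;
  float* s_sq = buf + blockDim.x;
  const float* row = x + static_cast<int64_t>(blockIdx.x) * cols;

  float sum = 0.f, sq = 0.f;
  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    float v = row[i];
    sum += v;
    sq += v * v;
  }
  s_sum[threadIdx.x] = sum;
  s_sq[threadIdx.x] = sq;
  __syncthreads();
  for (int off = blockDim.x / 2; off > 0; off >>= 1) {
    if (threadIdx.x < off) {
      s_sum[threadIdx.x] += s_sum[threadIdx.x + off];
      s_sq[threadIdx.x] += s_sq[threadIdx.x + off];
    }
    __syncthreads();
  }
  float mu = s_sum[0] / cols;
  float sigma2 = s_sq[0] / cols - mu * mu;  // Var[x] = E[x^2] - E[x]^2
  if (threadIdx.x == 0) {
    mean[blockIdx.x] = mu;
    var[blockIdx.x] = sigma2;
  }
  float inv = rsqrtf(sigma2 + eps);
  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    float g = gamma ? gamma[i] : 1.f;
    float b = beta ? beta[i] : 0.f;
    y[static_cast<int64_t>(blockIdx.x) * cols + i] = (row[i] - mu) * inv * g + b;
  }
}
```

A launch like `NaiveLayerNormFwd<<<batch_size, 256, 2 * 256 * sizeof(float)>>>(...)` covers one row per block; the deleted kernel instead picks the block size through FIXED_BLOCK_DIM_CASE and adds a fused fast path for feature_size == 1024.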
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct 
DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? 
bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 
c88880b43fff9..3c7e5bf9593e0 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8bd8..3b21a55f8df0d 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcfd9..72c6b41efa989 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ This operator performs elementwise lgamma for input $X$. class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - -REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b00b..0000000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e745732..0000000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
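Review note: the CUDA file removed above boils down to a single elementwise transform. A self-contained grid-stride sketch of the same computation, assuming plain float (the real path routes through Paddle's vectorized LaunchSameDimsElementwiseCudaKernel):

```cpp
// Grid-stride elementwise lgamma, mirroring the deleted CudaLgammaFunctor.
// lgammaf is the CUDA math library's single-precision log-gamma.
__global__ void LgammaElementwise(const float* x, float* out, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = lgammaf(x[i]);
  }
}
```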
- -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index 0e69b397e04c7..da38f906b9bd3 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
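Review note: the lgamma header deleted just above rests on the classical identity for the log-gamma derivative,

\[ \frac{d}{dx}\ln\Gamma(x) = \psi(x), \]

so LgammaGradFunctor's body is exactly \( dx_i = dout_i \cdot \psi(x_i) \), with Eigen::numext::digamma supplying \( \psi \).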
*/ -#include "paddle/fluid/operators/log_softmax_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMetaCheckAxis)); REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker, ops::LogSoftmaxOpInferVarType, ops::LogSoftmaxGradOpMaker, - ops::LogSoftmaxGradOpMaker); + ops::LogSoftmaxGradOpMaker, + LogSoftmaxInferShapeFunctor); REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - log_softmax, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - log_softmax_grad, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu deleted file mode 100644 index 8770abdac838f..0000000000000 --- a/paddle/fluid/operators/log_softmax_op.cu +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
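Review note: the log_softmax_op.cc hunk above repeats the central pattern of this PR, also applied to lgamma and masked_select: the per-op InferShape override is deleted and a shared phi meta function is bound at registration time. The skeleton of the pattern, with a hypothetical op name (my_op, MyOp and MyOpInferShapeFunctor are placeholders; the two macros are the real ones used throughout this diff):

```cpp
// Before: the operator overrode InferShape by hand in C++.
// After: shape inference is declared once and attached at registration.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::UnchangedInferMeta));

REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);
```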
- -#include -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" - -namespace paddle { -namespace operators { - -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. - shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. 
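Review note: WarpReduceSum and WarpReduceMax used by these kernels are butterfly reductions; platform::CudaShuffleXorSync is a thin wrapper over the __shfl_xor_sync intrinsic. A standalone sketch for a full 32-lane warp, assuming float for brevity:

```cpp
// XOR-butterfly all-reduce: after log2(32) = 5 rounds, every lane in the
// warp holds the sum of all 32 lanes, so no extra broadcast is needed.
__device__ __forceinline__ float WarpAllReduceSum(float v) {
  for (int offset = 16; offset > 0; offset >>= 1) {
    v += __shfl_xor_sync(0xffffffffu, v, offset);
  }
  return v;
}
```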
- for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. -inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. -template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 
0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} - -template -class LogSoftmaxKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } - } -}; - -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. 
write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a75662d..0000000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
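Review note: the backward kernels deleted above implement the standard log-softmax gradient. With \( y = \mathrm{log\_softmax}(x) \) and upstream gradient \( g \), and using \( \mathrm{softmax}(x)_i = e^{y_i} \),

\[ \frac{\partial L}{\partial x_i} = g_i - e^{y_i} \sum_j g_j, \]

which is the per-row `grad_output_register[iter] - std::exp(output_register[iter]) * sum` computed in ComputeLogSoftmaxBackwardInWarp.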
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. 
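Review note: CanonicalAxis, SizeToAxis and SizeFromAxis above collapse the N-D input into a 2-D view around the softmax axis. A worked example for a hypothetical shape {2, 3, 4, 5} with axis = -2:

```cpp
// dims = {2, 3, 4, 5}, rank = 4, axis = -2:
//   CanonicalAxis(-2, 4)  == 2            // negative axes wrap around
//   SizeToAxis(2, dims)   == 2 * 3 == 6   // n: rows of the flattened view
//   SizeFromAxis(2, dims) == 4 * 5 == 20  // d: columns of the view
//   axis_dim == dims[2] == 4, num_remain == d / axis_dim == 5,
// so max and sum reduce over the length-4 axis for each of 6 * 5 slices.
```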
- if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac78..6ce21aec9215a 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
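Review note: the Eigen functor just deleted evaluates the numerically stable form

\[ \mathrm{log\_softmax}(x_i) = x_i - m - \log\sum_j e^{x_j - m}, \qquad m = \max_j x_j, \]

shifting by the row max so no exponential can overflow; ValueClip additionally floors the shifted logits at -64 so the later exp() stays comfortably inside float range. The warp and block CUDA kernels deleted before it compute the same three terms (max, sum, log of sum) per row.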
-#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e49b..88d70d9bb7dae 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b942..10e2867bf2953 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h 
index 3cbbc62e7bec9..520722dafcbea 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f323e2e041d99..2414ae68438fd 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -403,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -531,15 +533,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -555,6 +557,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -585,8 +592,9 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, 
phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -605,8 +613,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -620,8 +629,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -656,8 +666,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); @@ -672,8 +682,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -695,8 +707,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08da8..e4100867dc685 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. 
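Review note: all the TrilTriuCompute call sites in the LU hunks above use the same flat-index masking, now shared via phi::funcs. A minimal sketch of that functor shape (illustrative name; the real phi::funcs::TrilTriuCompute carries the same row/column test):

```cpp
#include <cstdint>

// Keep the lower (tril) or upper (triu) triangle relative to `diagonal`,
// zero the rest; meant to be driven by a ForRange over all elements.
template <typename T>
struct TrilTriuSketch {
  const T* in;
  T* out;
  int diagonal;  // 0: main diagonal, -1: strictly below it, ...
  bool lower;    // true: tril, false: triu
  int64_t H, W;  // each batch item is an H x W matrix

  void operator()(int64_t idx) const {
    const int64_t row = (idx / W) % H;
    const int64_t col = idx % W;
    const bool keep = lower ? (col <= row + diagonal)
                            : (col >= row + diagonal);
    out[idx] = keep ? in[idx] : static_cast<T>(0);
  }
};
```

With diagonal = -1, lower = true this yields the strictly-lower L factor, and with diagonal = 0, lower = false the U factor, matching the two calls in LU_Unpack.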
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693b8..1887bbcfb7efd 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaksedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d5a86d62b417c..af1069cb86799 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas 
mkldnn_axpy_handler) @@ -46,7 +45,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index c9308d27c0a34..e1861b2f7c5ea 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -243,8 +243,6 @@ class ConcatFunctor { const int axis_t = axis; const int ins_size_t = ins_size; - auto place = context.GetPlace(); - output->mutable_data(place); // mlu should do sth // init ins tensors @@ -295,7 +293,6 @@ class SplitFunctor { std::vector desc_vector; for (size_t i = 0; i < out_size; i++) { (*outputs)[i]->Resize(outs_dims[i]); - (*outputs)[i]->mutable_data(context.GetPlace()); output_descs.emplace_back( MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType((*outputs)[i]->dtype()))); diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb13..b77e23450360c 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b148654..7b239b8166644 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? 
CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead21273..737196dde1dfc 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. 
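Note: the CUDA TriangularSolveFunctor being deleted here (continued below) batches many small TRSMs by handing cuBLAS a device-resident array of per-matrix pointers. A condensed sketch of that pointer-marshalling step, simplified to raw CUDA runtime calls (the deleted code uses the device context's allocator and an async stream copy instead; error checking omitted):

    #include <cuda_runtime.h>
    #include <vector>

    // Pack the addresses of A_i (m x m) and B_i (m x n) for each batch entry
    // into one host array, then mirror it on the device; batched BLAS entry
    // points consume the device pointer arrays rather than the matrices.
    void MarshalBatchPointers(const float* a, float* b, int batch, int m, int n,
                              const float*** d_a_ptrs, float*** d_b_ptrs) {
      std::vector<const float*> host_ptrs(2 * batch);
      for (int i = 0; i < batch; ++i) {
        host_ptrs[i] = a + static_cast<size_t>(i) * m * m;
        host_ptrs[batch + i] = b + static_cast<size_t>(i) * m * n;
      }
      void* dev = nullptr;
      cudaMalloc(&dev, host_ptrs.size() * sizeof(float*));
      cudaMemcpy(dev, host_ptrs.data(), host_ptrs.size() * sizeof(float*),
                 cudaMemcpyHostToDevice);
      *d_a_ptrs = reinterpret_cast<const float**>(dev);
      *d_b_ptrs = reinterpret_cast<float**>(dev) + batch;
    }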
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592f6..415d0c6dd8e0c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad38644b..0000000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
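Note: the header being deleted here documents the pooling-process protocol its functors share: initial() yields the reduction identity, compute() folds in one element, and finalize() post-processes (the averaging division for AvgPool, a no-op for MaxPool). A standalone host-only sketch of the protocol in use (the real classes are templated and device-aware):

    #include <cfloat>
    #include <vector>

    struct MaxPoolSketch {
      float initial() const { return -FLT_MAX; }  // identity element for max
      void compute(float x, float* y) const { *y = *y > x ? *y : x; }
      void finalize(float pool_field, float* y) const {}  // nothing to do
    };

    // Reduce one pooling window with the three-phase protocol.
    float PoolWindow(const std::vector<float>& window) {
      MaxPoolSketch op;
      float y = op.initial();
      for (float x : window) op.compute(x, &y);
      op.finalize(static_cast<float>(window.size()), &y);
      return y;
    }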
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
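Note: among the helpers in this deleted header, AdaptStartIndex and AdaptEndIndex split an input axis into output_size nearly equal bins for adaptive pooling: bin ph covers [floor(ph*in/out), ceil((ph+1)*in/out)). A worked standalone copy: for in = 10, out = 4 the bins are [0,3), [2,5), [5,8), [7,10), so neighboring bins may overlap by design:

    #include <cmath>

    inline int AdaptStart(int ph, int in, int out) {
      return static_cast<int>(std::floor(static_cast<double>(ph * in) / out));
    }
    inline int AdaptEnd(int ph, int in, int out) {
      return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * in) / out));
    }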
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 1524a50f1ac6d..87df75ac46504 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_dims.size() == 3 && y_dims.size() <= 2) { + if (x_dims.size() >= 3 && y_dims.size() <= 2) { // if transpose_X is true, the transpose cost much time if (!trans_x) { mat_dim_a.height_ *= mat_dim_a.batch_size_; diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f364..56f65340ea999 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. 
But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -113,19 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.cu b/paddle/fluid/operators/matrix_power_op.cu deleted file mode 100644 index d972e9499dc88..0000000000000 --- a/paddle/fluid/operators/matrix_power_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/matrix_power_op.h" - -namespace ops = paddle::operators; -namespace plf = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CUDA_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index 8eb9c58513df6..0000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/matrix_inverse.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. 
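Note: MatrixPowerFunction above special-cases |n| <= 4; the continuation below handles larger exponents by exponentiation-by-squaring in O(log n) matrix multiplies, tracking whether the accumulator has been initialized rather than starting from an identity matrix. A self-contained sketch of the same scheme on a single 2x2 matrix (MatMul stands in for blas.MatMul on batched tensors):

    #include <array>

    using Mat2 = std::array<std::array<double, 2>, 2>;

    Mat2 MatMul(const Mat2& a, const Mat2& b) {
      Mat2 c{};  // zero-initialized accumulator
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
          for (int k = 0; k < 2; ++k) c[i][j] += a[i][k] * b[k][j];
      return c;
    }

    Mat2 MatPow(Mat2 base, int n) {  // requires n >= 1
      Mat2 result{};
      bool inited = false;
      while (n > 0) {
        if (n & 1) {  // this bit of n contributes base^(2^i) to the product
          result = inited ? MatMul(result, base) : base;
          inited = true;
        }
        base = MatMul(base, base);  // square for the next bit
        n >>= 1;
      }
      return result;
    }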
- Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 1f04875c2203b..e7d08b6597360 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -70,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -115,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. 
- T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, 
ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index dccd716022d2a..0000000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. 
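Note: the CPU kernel deleted above and this GPU kernel share one tolerance rule, visible in both bodies: with default tolerances, atol = 0 and rtol = eps * max(rows, cols); the threshold is max(atol, rtol * sigma_max), and the rank is the count of singular values (absolute eigenvalues when hermitian = true) above it. A standalone sketch of that final counting step:

    #include <algorithm>
    #include <vector>

    // sigma: singular values of one matrix in the batch (assumed non-empty).
    int RankFromSingularValues(const std::vector<double>& sigma, double atol,
                               double rtol) {
      const double sigma_max = *std::max_element(sigma.begin(), sigma.end());
      const double tol = std::max(atol, rtol * sigma_max);
      int rank = 0;
      for (double s : sigma) rank += (s > tol) ? 1 : 0;
      return rank;
    }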
- Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 056620db5b966..32ef052119883 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -125,8 +64,11 @@ with the input Out(Inference). // FIXME(typhoonzero): types of T is for infernece data. // label data is always int. 
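Note: the DECLARE_INFER_SHAPE_FUNCTOR registration that follows mirrors the masked_select migration earlier in this diff. A hypothetical sketch of the bound meta function, reduced to the shape contract of the deleted InferShape (the real phi::AccuracyInferMeta is declared in paddle/phi/infermeta/ternary.h and may additionally carry the label-shape checks):

    #include "paddle/phi/core/ddim.h"
    #include "paddle/phi/core/meta_tensor.h"

    // Three scalar (shape {1}) outputs; LoD is shared from the inference
    // input "Out", exactly as the deleted fluid InferShape did.
    void AccuracyInferMetaSketch(const phi::MetaTensor& out,
                                 const phi::MetaTensor& indices,
                                 const phi::MetaTensor& label,
                                 phi::MetaTensor* accuracy,
                                 phi::MetaTensor* correct,
                                 phi::MetaTensor* total) {
      accuracy->set_dims(phi::make_ddim({1}));
      correct->set_dims(phi::make_ddim({1}));
      total->set_dims(phi::make_ddim({1}));
      accuracy->share_lod(out);
    }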
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab0..2e82b47e8da1c 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; @@ -150,4 +151,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9d5..1078b451c55ba 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a8e..86ecb01c89af7 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7..a3b764b0e1c46 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
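Note on the pool_mkldnn hunks above: UpdateKsize and UpdatePadding move to phi::funcs as UpdateKernelSize and UpdatePadding with an unchanged call shape. A sketch of the assumed behavior behind UpdateKernelSize, inferred from its call sites (it only runs when global_pooling is set, so the effective kernel spans the whole spatial extent while UpdatePadding zeroes the paddings):

    #include <cstdint>
    #include <vector>

    void UpdateKernelSizeSketch(std::vector<int>* ksize,
                                const std::vector<int64_t>& spatial_dims) {
      for (size_t i = 0; i < ksize->size(); ++i) {
        (*ksize)[i] = static_cast<int>(spatial_dims[i]);  // kernel == input dim
      }
    }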
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index e9dadd5ec937c..4090d5ffca801 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); @@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 916f02179b364..0e988557df626 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,18 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { @@ -94,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107c..9c16ccb138f7d 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index afb949d3374c6..0000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - 
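Note: getModebySort in the deleted CUDA file finds each row's mode by sorting the row together with its original indices and then locating the longest run of equal keys (thrust::reduce_by_key plus max_element). A CPU analogue of the same idea, using a stable sort so the reported index is deterministic (ModeOfRow is an illustrative reimplementation, not Paddle code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

template <typename T>
std::pair<T, int64_t> ModeOfRow(std::vector<T> row) {
  std::vector<int64_t> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);
  // Stable sort by value; ties keep original order, so the last element
  // of the winning run carries the largest original index in that run.
  std::stable_sort(idx.begin(), idx.end(),
                   [&](int64_t a, int64_t b) { return row[a] < row[b]; });
  T mode = row[idx[0]];
  int64_t mode_idx = idx[0];
  int64_t best = 0, run = 0;
  for (size_t i = 0; i < idx.size(); ++i) {
    run = (i > 0 && row[idx[i]] == row[idx[i - 1]]) ? run + 1 : 1;
    if (run > best) { best = run; mode = row[idx[i]]; mode_idx = idx[i]; }
  }
  return {mode, mode_idx};
}

int main() {
  auto result = ModeOfRow<int>({7, 3, 3, 9, 3, 7});
  std::printf("mode=%d index=%lld\n", result.first, (long long)result.second);
  // prints: mode=3 index=4
  return 0;
}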
std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - 
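Note: when axis is not the last dimension, the deleted kernels first build a permutation that swaps axis with the last dimension, compute the mode row-wise, and transpose the result back. The permutation they construct, as a standalone function (AxisToLastPerm is an illustrative name):

#include <cstdio>
#include <vector>

std::vector<int> AxisToLastPerm(int rank, int axis) {
  std::vector<int> perm;
  for (int i = 0; i < axis; ++i) perm.push_back(i);  // dims before axis
  perm.push_back(rank - 1);                  // old last dim fills axis' slot
  for (int i = axis + 1; i < rank - 1; ++i) perm.push_back(i);
  perm.push_back(axis);                      // reduced axis goes last
  return perm;
}

int main() {
  for (int p : AxisToLastPerm(4, 1)) std::printf("%d ", p);  // prints: 0 3 2 1
  return 0;
}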
ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3..0000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& 
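Note: the deleted CPU getMode sorts each row with a NaN-aware comparator: any non-NaN value orders before NaN, so NaN entries cluster at the end of the row instead of corrupting the ordering. The comparator in isolation:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Non-NaN values order before NaN; two NaNs compare equivalent.
bool NanLastLess(float l, float r) {
  return (!std::isnan(l) && std::isnan(r)) || l < r;
}

int main() {
  std::vector<float> v = {2.0f, NAN, 1.0f, NAN, 3.0f};
  std::sort(v.begin(), v.end(), NanLastLess);
  for (float x : v) std::printf("%g ", x);  // prints: 1 2 3 nan nan
  return 0;
}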
in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor 
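Note: the deleted grad kernels implement a scatter: the output gradient of each row flows only to the element that was selected as the mode (via the saved Indices tensor), and every other input gradient stays zero, hence the memset above. Reduced to plain C++ (ModeGradRowwise is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> ModeGradRowwise(const std::vector<float>& dout,
                                   const std::vector<int64_t>& indices,
                                   int64_t rows, int64_t cols) {
  std::vector<float> dx(rows * cols, 0.0f);  // the memset in the kernel
  for (int64_t i = 0; i < rows; ++i)
    dx[i * cols + indices[i]] = dout[i];     // the ModeAssign scatter
  return dx;
}

int main() {
  std::vector<float> dx = ModeGradRowwise({1.0f, 2.0f}, {2, 0}, 2, 3);
  for (float g : dx) std::printf("%g ", g);  // prints: 0 0 1 2 0 0
  return 0;
}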
tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef90..5b107ce643df3 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. 
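Note: the deleted ComputeAndCheckShape helper above is replaced by phi::MultiDotInferMeta, keeping the NumPy multi_dot conventions: a 1-D first argument acts as a row vector, a 1-D last argument as a column vector, and adjacent operands must chain like a matmul. A runnable restatement of the shape rule (MultiDotOutDims is an illustrative name):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using DDim = std::vector<int64_t>;

DDim MultiDotOutDims(const std::vector<DDim>& ins) {
  assert(ins.size() > 1);  // multi_dot needs more than one operand
  DDim first = ins.front(), last = ins.back();
  const bool vec_first = first.size() == 1, vec_last = last.size() == 1;
  if (vec_first) first = {1, first[0]};  // 1-D first arg -> row vector
  if (vec_last) last = {last[0], 1};     // 1-D last arg -> column vector
  int64_t width = first[1];
  for (size_t i = 1; i + 1 < ins.size(); ++i) {
    assert(ins[i].size() == 2 && ins[i][0] == width);  // must chain
    width = ins[i][1];
  }
  assert(last[0] == width);
  if (vec_first && vec_last) return {1};
  if (vec_first) return {last[1]};
  if (vec_last) return {first[0]};
  return {first[0], last[1]};
}

int main() {
  for (int64_t d : MultiDotOutDims({{3, 4}, {4, 5}, {5}}))
    std::printf("%lld ", (long long)d);  // prints: 3
  return 0;
}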
If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea301b..8771a6573cba0 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb693..0000000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
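Note: the multiplex kernels deleted here compute a row-wise gather: output row i is copied from candidate tensor ins[ids[i]], after checking 0 <= ids[i] < ins.size(). The semantics in a few lines of host code (Multiplex here is an illustrative reimplementation, not the phi kernel):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using Mat = std::vector<std::vector<float>>;

Mat Multiplex(const std::vector<Mat>& ins, const std::vector<int32_t>& ids) {
  Mat out(ids.size());
  for (size_t i = 0; i < ids.size(); ++i) {
    assert(ids[i] >= 0 && static_cast<size_t>(ids[i]) < ins.size());
    out[i] = ins[ids[i]][i];  // row i comes from the chosen candidate
  }
  return out;
}

int main() {
  Mat a = {{1, 1}, {1, 1}}, b = {{2, 2}, {2, 2}};
  Mat out = Multiplex({a, b}, {1, 0});
  std::printf("%g %g\n", out[0][0], out[1][0]);  // prints: 2 1
  return 0;
}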
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edeedc..0000000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 6c35ad29e9749..a4e1f7b3091a9 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. 
But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,8 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54f5..51daccce0e882 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension. 
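Note: the deleted NLLLossOp::InferShape above (now phi::NllLossRawInferMeta) produces per-sample losses only for reduction == "none": shape {N} for 2-D {N, C} inputs and {N, H, W} for 4-D {N, C, H, W} inputs, and a single-element tensor otherwise; Total_weight is always {1}. The Out rule as a function (NllLossOutDims is an illustrative name, validation omitted):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::vector<int64_t> NllLossOutDims(const std::vector<int64_t>& x_dims,
                                    const std::string& reduction) {
  if (reduction != "none") return {1};           // "mean" or "sum"
  if (x_dims.size() == 2) return {x_dims[0]};    // {N, C} input
  return {x_dims[0], x_dims[2], x_dims[3]};      // {N, C, H, W} input
}

int main() {
  std::vector<int64_t> d = NllLossOutDims({8, 10, 4, 4}, "none");
  std::printf("%lld %lld %lld\n", (long long)d[0], (long long)d[1],
              (long long)d[2]);  // prints: 8 4 4
  return 0;
}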
}; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 0000000000000..8f7a3b82acf19 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
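Note: the deleted NormOp::InferShape above is similarly mechanical: Out mirrors X, and when is_test is false the auxiliary Norm output keeps X's shape with the normalized axis collapsed to 1 (negative axes wrap). The Norm shape rule in isolation (NormNormDims is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> NormNormDims(std::vector<int64_t> x, int axis) {
  if (axis < 0) axis += static_cast<int>(x.size());  // wrap negative axis
  x[axis] = 1;  // the reduced axis keeps a size-1 slot
  return x;
}

int main() {
  std::vector<int64_t> d = NormNormDims({2, 16, 8}, -1);
  std::printf("%lld %lld %lld\n", (long long)d[0], (long long)d[1],
              (long long)d[2]);  // prints: 2 16 1
  return 0;
}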
+ +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "ļ¼ˆint), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 0000000000000..97e4b4f2845ae --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 0000000000000..95e64946fb8a2 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b7d..122b6a8a80aac 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50de..0000000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b418..e5702a37bb2b4 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
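Note: the deleted FillOutputKernel is the core of one_hot: the output has shape in_dims + [depth], starts zeroed, and position (i, in[i]) is set to 1, with negative or out-of-range indices silently skipped. A host-side equivalent (OneHot is an illustrative reimplementation):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> OneHot(const std::vector<int64_t>& in, int depth) {
  std::vector<float> out(in.size() * depth, 0.0f);  // the set_constant zero
  for (size_t i = 0; i < in.size(); ++i) {
    if (in[i] >= 0 && in[i] < depth) out[i * depth + in[i]] = 1.0f;
  }
  return out;
}

int main() {
  std::vector<float> out = OneHot({1, 0, 3}, 4);
  for (size_t i = 0; i < out.size(); ++i)
    std::printf("%g%s", out[i], (i + 1) % 4 ? " " : "\n");
  // prints: 0 1 0 0 / 1 0 0 0 / 0 0 0 1
  return 0;
}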
*/ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce..372a71706ab5e 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 0000000000000..e5399ee36ba7f --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto params = ctx.MultiInput<framework::Tensor>("Param");
+    auto params_out = ctx.MultiOutput<framework::Tensor>("ParamOut");
+    size_t n = params.size();
+    PADDLE_ENFORCE_EQ(n, params_out.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Output(ParamOut) must be equal to "
+                          "Input(Param), but got the size of Output(ParamOut) "
+                          "is %d, the size of Input(Param) is %d.",
+                          params_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(params[i], params_out[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Param) and Output(ParamOut) "
+                            "must be the same Tensors."));
+    }
+
+    auto grads = ctx.MultiInput<framework::Tensor>("Grad");
+    PADDLE_ENFORCE_EQ(
+        n, grads.size(),
+        platform::errors::InvalidArgument(
+            "The size of Input(Grad) must be equal to Input(Param), but got "
+            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
+            grads.size(), n));
+
+    auto velocitys = ctx.MultiInput<framework::Tensor>("Velocity");
+    PADDLE_ENFORCE_EQ(n, velocitys.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Input(Velocity) must be equal to "
+                          "Input(Param), but got the size of Input(Velocity) "
+                          "is %d, the size of Input(Param) is %d.",
+                          velocitys.size(), n));
+
+    auto velocitys_out = ctx.MultiOutput<framework::Tensor>("VelocityOut");
+    PADDLE_ENFORCE_EQ(
+        n, velocitys_out.size(),
+        platform::errors::InvalidArgument(
+            "The size of Output(VelocityOut) must be "
+            "equal to Input(Param), but got the size of Output(VelocityOut) is "
+            "%d, the size of Input(Param) is %d.",
+            velocitys_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Velocity) and Output(VelocityOut) must be "
+                            "the same Tensors."));
+    }
+
+    auto mu = ctx.Attr<float>("mu");
+    auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
+    if (lrs.size() != 1) {
+      PADDLE_ENFORCE_EQ(
+          n, lrs.size(),
+          platform::errors::InvalidArgument(
+              "If the size of Input(LearningRate) is not 1, the size of "
+              "Input(LearningRate) must be "
+              "equal to Input(Param), but got the size of Input(LearningRate) "
+              "is %d, the size of Input(Param) is %d.",
+              lrs.size(), n));
+    }
+    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto regularization_methods =
+        ctx.Attr<std::vector<std::string>>("regularization_method");
+    auto regularization_coeffs =
+        ctx.Attr<std::vector<float>>("regularization_coeff");
+    if (regularization_methods.size() != 0) {
+      PADDLE_ENFORCE_EQ(
+          n, regularization_methods.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_method) must be equal "
+              "to Input(Param), but got the size of "
+              "Attr(regularization_method) is %d, the size of Input(Param) is "
+              "%d.",
+              regularization_methods.size(), n));
+      PADDLE_ENFORCE_EQ(
+          n, regularization_coeffs.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_coeff) must be equal "
+              "to Input(Param), but got the size of Attr(regularization_coeff) "
+              "is %d, the size of Input(Param) is %d.",
+              regularization_coeffs.size(), n));
+    }
+
+    VLOG(5) << "use_nesterov: " << use_nesterov
+            << ", regularization_methods.size(): "
+            << regularization_methods.size()
+            << ", regularization_coeffs.size(): "
+            << regularization_coeffs.size();
+
+    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
+
+    Tensor mu_tensor =
+        ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
+    MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
+    MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
+
+    for (size_t idx = 0; idx < n; ++idx) {
+      RegularizationType regularization_flag =
+          regularization_methods.size() > 0 &&
+                  regularization_methods[idx] == "l2_decay"
+              ? RegularizationType::kL2DECAY
+              : RegularizationType::kNONE;
+      T regularization_coeff = static_cast<T>(0.0);
+      if (regularization_coeffs.size() != 0) {
+        regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
+      }
+
+      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
+      auto param_out = params_out[idx];
+      auto velocity_out = velocitys_out[idx];
+
+      auto grad = grads[idx];
+      Tensor regularized_grad;
+      MLUCnnlTensorDesc param_desc(*param_out);
+      if (regularization_flag == RegularizationType::kL2DECAY) {
+        regularized_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
+            param_out->dims(), dev_ctx);
+        MLUCnnlOpTensorDesc op_tensor_desc(
+            CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
+        MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(),
+                          GetBasePtr(param_out), param_desc.get(),
+                          GetBasePtr(grad), param_desc.get(),
+                          GetBasePtr(&regularized_grad), ToCnnlDataType<T>(),
+                          regularization_coeff);
+      } else {
+        regularized_grad = *grad;
+      }
+      MLUCnnl::ApplyMomentum(ctx, param_desc.get(),
+                             GetBasePtr(&regularized_grad), use_nesterov,
+                             GetBasePtr(learning_rate), GetBasePtr(&mu_tensor),
+                             GetBasePtr(param_out), GetBasePtr(velocity_out));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(merged_momentum,
+                       ops::MLUMergedMomentumOpKernel<float>,
+                       ops::MLUMergedMomentumOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
index 7b9a4ab1557bf..e4952a243262b 100644
--- a/paddle/fluid/operators/pad3d_op.cc
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -24,734 +26,10 @@ namespace operators {
 
 using framework::Tensor;
 
-template <typename T>
-void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                         const int in_height, const int in_width,
-                         const int out_depth, const int out_height,
-                         const int out_width, const int pad_front,
-                         const int pad_top, const int pad_left, const int out_d,
-                         const int out_h, const int out_w, const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-       in_h >= in_height || in_w >= in_width)
-          ?
value - : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) { - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = value; - } - } else { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } -} - -template -void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = 
- in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void Pad3DNCDHW(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width; - out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DNDHWC(const T* in_data, const int num, const int channels, - const int in_depth, 
const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width * channels; - out_data += out_depth * out_height * out_width * channels; - } -} - -template -void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] = d_out_data[out_index + c]; - } - } -} - -template -void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int 
out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, - const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - 
(out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width; - d_out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, channels, in_depth, in_height, - in_width, out_depth, out_height, out_width, pad_front, - pad_top, pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width * channels; - d_out_data += out_depth * out_height * out_width * channels; - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - auto paddings_data = paddings_t->data(); - std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - - auto* out = context.Output("Out"); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], in_dims[4]}); - } - auto out_dims = out->dims(); - T* out_data = out->mutable_data(context.GetPlace()); - - int 
channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - if (data_format == "NCDHW") { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNCDHW; - func_map["replicate"] = ReplicatePad3DFuncNCDHW; - func_map["circular"] = CircularPad3DFuncNCDHW; - func_map["constant"] = ConstPad3DFuncNCDHW; - Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } else { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNDHWC; - func_map["replicate"] = ReplicatePad3DFuncNDHWC; - func_map["circular"] = CircularPad3DFuncNDHWC; - func_map["constant"] = ConstPad3DFuncNDHWC; - Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } - } -}; - -template -class Pad3dGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = 
d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = d_in_dims[0]; - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNCDHW; - func_map["replicate"] = ReplicatePad3DGradNCDHW; - func_map["circular"] = CircularPad3DGradNCDHW; - func_map["constant"] = ConstPad3DGradNCDHW; - - Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNDHWC; - func_map["replicate"] = ReplicatePad3DGradNDHWC; - func_map["circular"] = CircularPad3DGradNDHWC; - func_map["constant"] = ConstPad3DGradNDHWC; - - Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } - } -}; - class Pad3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 5, - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension should be equal to " - "5, but received %d. ", - x_dim.size())); - - std::vector out_dims(x_dim.size()); - auto data_format = ctx->Attrs().Get("data_format"); - out_dims[0] = x_dim[0]; - if (ctx->HasInput("Paddings")) { - auto paddings_dim = ctx->GetInputDim("Paddings"); - PADDLE_ENFORCE_EQ(paddings_dim.size(), 1, - platform::errors::InvalidArgument( - "Size of Input(Paddings)'s dimension should be " - "equal to 1, but received %d.", - paddings_dim.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(paddings_dim[0], 6, - platform::errors::InvalidArgument( - "Shape of Input(Paddings) should be equal to " - "[6], but received [%d].", - paddings_dim[0])); - } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - } else { - auto paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - paddings.size(), 6, - platform::errors::InvalidArgument( - "Size of paddings should be equal to 4, but received %d.", - static_cast(paddings.size()))); - if (data_format == "NCDHW") { - out_dims[1] = x_dim[1]; // channel - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[4] + paddings[5]); // depth - - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? 
x_dim[3] - : (x_dim[3] + paddings[2] + paddings[3]); // height - - out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0)) - ? x_dim[4] - : (x_dim[4] + paddings[0] + paddings[1]); // width - } else { // NDHWC - out_dims[4] = x_dim[4]; // channel - - out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) - ? x_dim[1] - : (x_dim[1] + paddings[4] + paddings[5]); // depth - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[2] + paddings[3]); // height - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[0] + paddings[1]); // width - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -921,15 +199,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad3d, Pad3dInferShapeFunctor, + PD_INFER_META(phi::Pad3dInferMeta)); + REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker, ops::Pad3dOpGradMaker, - ops::Pad3dOpGradMaker); + ops::Pad3dOpGradMaker, + Pad3dInferShapeFunctor); REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel, - ops::Pad3dGradCPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu deleted file mode 100644 index 9ab0eb9d445da..0000000000000 --- a/paddle/fluid/operators/pad3d_op.cu +++ /dev/null @@ -1,793 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -using framework::Tensor; - -template -__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? 
value - : in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w]; - } -} - -template -__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - const int in_d = out_d - pad_front; - const int in_h = out_h - pad_top; - const int in_w = out_w - pad_left; - - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); // reflect by 0 - in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = max(in_h, -in_h); // reflect by 0 - in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = max(in_w, -in_w); // reflect by 0 - in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = max(in_h, -in_h); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = max(in_w, -in_w); - in_w = min(in_w, 2 * in_width - in_w - 2); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const 
int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data, - const int num, const int channels, - const 
int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int in_w = in_index % in_width; - - int nc = in_index / in_width; - const int in_h = nc % in_height; - - nc /= in_height; - const int in_d = nc % in_depth; - - nc /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - d_in_data[in_index] = - d_out_data[nc * out_depth * out_height * out_width + - out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int c = in_index % channels; - int n = in_index / channels; - - const int in_w = n % in_width; - n /= in_width; - - const int in_h = n % in_height; - n /= in_height; - - const int in_d = n % in_depth; - n /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - - d_in_data[in_index] = - d_out_data[n * out_depth * out_height * out_width * channels + - out_d * out_height * out_width * channels + - out_h * out_width * channels + out_w * channels + c]; - } -} - -template -__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = min(in_w, 2 * in_width - in_w - 2); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, in_depth * 2 - in_d - 2); - in_h = min(in_h, in_height * 2 - in_h - 2); 
- in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNCDHW( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNDHWC( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - 
CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_data = context.Input("Paddings"); - if (paddings_data) { - Tensor pads; - framework::TensorCopySync(*paddings_data, platform::CPUPlace(), &pads); - auto pads_data = pads.data(); - std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - auto* out = context.Output("Out"); - auto out_dims = out->dims(); - if (data_format == "NCDHW") { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1]; - out_dims[2] = in_dims[2] + pads[4] + pads[5]; - out_dims[3] = in_dims[3] + pads[2] + pads[3]; - out_dims[4] = in_dims[4] + pads[0] + pads[1]; - } else { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1] + pads[4] + pads[5]; - out_dims[2] = in_dims[2] + pads[2] + pads[3]; - out_dims[3] = in_dims[3] + pads[0] + pads[1]; - out_dims[4] = in_dims[4]; - } - T* out_data = out->mutable_data(out_dims, context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - 
"greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = out->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - if (mode == "reflect") { - Pad3DReflectNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } else { - if (mode == "reflect") { - Pad3DReflectNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } - } -}; - -template -class Pad3dGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - - const int pad_left = pads[0]; - const int 
pad_top = pads[2]; - const int pad_front = pads[4]; - - const int num = d_in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = d_out->numel(); - const int in_size = d_in->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - if (mode == "reflect") { - Pad3DGradReflectNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNCDHW<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - if (mode == "reflect") { - Pad3DGradReflectNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNDHWC<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 229e61ac9fe79..dc162ae5782f2 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
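
Note: the CPU and CUDA pad3d kernels deleted above share one piece of arithmetic: mapping an output coordinate back to an input coordinate on each axis, with only the border rule differing between modes. A condensed sketch of that per-axis mapping (the MapIndex helper is hypothetical; the live implementations are now phi kernels):

#include <algorithm>
#include <string>

// Map an output index back to an input index along one axis.
// `pad` is the low-side padding, `size` the input extent on that axis.
int MapIndex(int out_idx, int pad, int size, const std::string& mode) {
  int in_idx = out_idx - pad;
  if (mode == "reflect") {           // mirror without repeating the border
    in_idx = std::max(in_idx, -in_idx);
    in_idx = std::min(in_idx, 2 * size - in_idx - 2);
  } else if (mode == "replicate") {  // clamp to the border
    in_idx = std::min(size - 1, std::max(in_idx, 0));
  } else if (mode == "circular") {   // wrap around
    in_idx = ((in_idx % size) + size) % size;
  }
  // "constant" mode: an out-of-range in_idx stays out of range and the
  // output element is filled with `value` instead of an input read.
  return in_idx;
}
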
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index 229e61ac9fe79..dc162ae5782f2 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    PADDLE_ENFORCE_EQ(
-        static_cast<int>(paddings.size()), x_dim.size() * 2,
-        platform::errors::InvalidArgument(
-            "Size of 'paddings' dimension should be equal to 2 * size of "
-            "Input(X)'s dimension, but received (size of 'paddings' dimension "
-            "is) %d vs (2 * size of Input(X)'s dimension is) %d.",
-            static_cast<int>(paddings.size()), x_dim.size() * 2));
-    for (size_t i = 0; i < paddings.size(); ++i) {
-      PADDLE_ENFORCE_GE(paddings[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The element of 'paddings' should >= 0, but "
-                            "received %d for index %d.",
-                            paddings[i], static_cast<int>(i)));
-    }
-    std::vector<int64_t> out_dims(x_dim.size());
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
-        out_dims[i] = -1;
-      } else {
-        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
-      }
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    if (out_dims[0] == x_dim[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
   }
 };
 
@@ -160,10 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor,
+                            PD_INFER_META(phi::PadInferMeta));
 
 REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
                   ops::PadOpGradMaker<paddle::framework::OpDesc>,
-                  ops::PadOpGradMaker<paddle::imperative::OpBase>);
+                  ops::PadOpGradMaker<paddle::imperative::OpBase>,
+                  PadInferShapeFunctor);
 REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
                   ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc
index d0cb674b4049f..adc4a2ffaf8c5 100644
--- a/paddle/fluid/operators/pad_op_npu.cc
+++ b/paddle/fluid/operators/pad_op_npu.cc
@@ -90,5 +90,5 @@ namespace plat = paddle::platform;
 REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel<float>,
                        ops::PadNPUKernel<int>,
                        ops::PadNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel<float>,
+REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel<float>,
                        ops::PadGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
deleted file mode 100644
index 6335004e69a37..0000000000000
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ /dev/null
@@ -1,567 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
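Review note on the "Because beta is zero" comment above: cuDNN's pooling entry points follow the BLAS-like blending convention dst = alpha * op(src) + beta * dst, so beta == 0 means the destination is purely overwritten and never read, which is why no memset of input_grad is needed. A plain-C++ illustration of the convention (no cuDNN involved; values are arbitrary):

    #include <cstdio>
    #include <vector>

    // Mimics cuDNN's alpha/beta blending: dst = alpha * op_out + beta * dst.
    void scale_blend(float alpha, float beta, const std::vector<float>& op_out,
                     std::vector<float>* dst) {
      for (size_t i = 0; i < dst->size(); ++i)
        (*dst)[i] = alpha * op_out[i] + beta * (*dst)[i];
    }

    int main() {
      std::vector<float> dst = {42.f, 42.f};  // stale, uninitialized-style contents
      scale_blend(/*alpha=*/1.f, /*beta=*/0.f, {1.f, 2.f}, &dst);
      std::printf("%g %g\n", dst[0], dst[1]);  // prints "1 2": stale values never read
      return 0;
    }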
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7aaa..44f3d8090e565 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ width, respectively. The input(X) size and output(Out) size may be different. 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad,
+                            Pool2dDoubleGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+
 REGISTER_OPERATOR(
     pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool2dInferShapeFunctor);
 REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad,
                   ops::Pool2dOpGradGradMaker<paddle::framework::OpDesc>,
-                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp);
-
-REGISTER_OP_CPU_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad_grad,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>,
+                  Pool2dGradInferShapeFunctor);
+REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp,
+                  Pool2dDoubleGradInferShapeFunctor);
+
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
 
 REGISTER_OPERATOR(
     pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
-REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool3dInferShapeFunctor);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu
deleted file mode 100644
index 069ce0c1fda85..0000000000000
--- a/paddle/fluid/operators/pool_op.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86db..d48ac3bd358ef 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else 
if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, 
paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } 
break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231b6..fa88d128a9a1d 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9c3..0efcb8b7981c3 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
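Review note: the MLU and NPU hunks in this area swap the deleted fluid helpers for phi::funcs::UpdatePadding / UpdateKernelSize (the latter renamed from UpdateKsize). The SAME-padding rule the deleted fluid helper implemented, visible earlier in this diff, is easy to sanity-check in isolation; the numbers below are arbitrary:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int in = 13, stride = 2, k = 3;
      // SAME: out = ceil(in / stride), then pad just enough to make it fit,
      // split as evenly as possible with the extra unit on the bottom/right.
      const int out = (in + stride - 1) / stride;                    // 7
      const int pad_sum = std::max((out - 1) * stride + k - in, 0);  // 2
      const int pad_0 = pad_sum / 2;                                 // 1
      const int pad_1 = pad_sum - pad_0;                             // 1
      std::printf("out=%d pad=(%d,%d)\n", out, pad_0, pad_1);
      return 0;
    }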
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c108039..87c437d8a78e0 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae05613..e0341f4a4b471 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce25..0000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c89..0000000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < 
ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab829..de35f67405810 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -24,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. 
" - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW") { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. 
" - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -128,6 +64,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +154,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template @@ -236,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d703..0000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a..0000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237d..cfacffff23410 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int 
pooled_width = ctx->Attrs().Get<int>("pooled_width"); - int output_channels = ctx->Attrs().Get<int>("output_channels"); - float spatial_scale = ctx->Attrs().Get<float>("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - platform::errors::InvalidArgument( - "The pooled output channels must be greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must be greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker<paddle::framework::OpDesc>, - ops::PSROIPoolGradMaker<paddle::imperative::OpBase>); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>, - ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>, - ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>); + ops::PSROIPoolGradMaker<paddle::imperative::OpBase>, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5..0000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0..0000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
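// For one output cell (ph, pw) of a given ROI, the pooled window on the
// input map is the half-open box
//   rows [floor(ph * bin_size_h + roi_start_h), ceil((ph + 1) * bin_size_h + roi_start_h))
//   cols [floor(pw * bin_size_w + roi_start_w), ceil((pw + 1) * bin_size_w + roi_start_w))
// clipped to the feature map; an empty window yields 0, otherwise the
// window sum divided by the window area, as the loops below compute.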
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
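// ROIs can overlap on the input map, so the loop below scatters each
// output-pixel gradient back with "+=" into X@GRAD; the buffer must be
// zero-filled first or the accumulation would start from stale values.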
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b009..82fc9ef1b7858 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd34..5ef02d8942797 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() 
* sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b1..c7e91ba35dee1 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 0a5d54e72c845..83a21a919dcaa 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce frobenius_norm"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(frobenius_norm, FrobeniusNormInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker, ops::FrobeniusNormOpGradMaker, - ops::FrobeniusNormOpGradMaker); + ops::FrobeniusNormOpGradMaker, + FrobeniusNormInferShapeFunctor); REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp); - -REGISTER_OP_CPU_KERNEL(frobenius_norm, - ops::ReduceKernel, - ops::ReduceKernel); - -template -using CPUFrobeniusNormGradKernel = - ops::FrobeniusNormGradKernel; - -REGISTER_OP_CPU_KERNEL(frobenius_norm_grad, CPUFrobeniusNormGradKernel, - CPUFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu deleted file mode 100644 index b2cef09df9436..0000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
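The two Eigen functors deleted just below reduce to simple math: the forward pass is y = sqrt(sum(x^2)) over the reduced dimensions, and the backward pass broadcasts y and dy back to x's shape and computes dx = x / (y + 1e-12) * dy. A minimal scalar sketch of the same computation over a flat array (a free-standing illustration, not Paddle API):

#include <cmath>
#include <cstddef>

// Forward: Frobenius norm ||x||_F of n elements.
double frobenius_norm(const double* x, std::size_t n) {
  double sum_sq = 0.0;
  for (std::size_t i = 0; i < n; ++i) sum_sq += x[i] * x[i];
  return std::sqrt(sum_sq);
}

// Backward: dx_i = x_i / (||x|| + eps) * dy, with the same 1e-12 guard
// against division by zero that FrobeniusNormGradFunctor adds.
void frobenius_norm_grad(const double* x, std::size_t n, double y, double dy,
                         double* dx) {
  const double eps = 1e-12;
  for (std::size_t i = 0; i < n; ++i) dx[i] = x[i] / (y + eps) * dy;
}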
- -#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -template -using CUDAFrobeniusNormKernel = - ops::ReduceKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm, CUDAFrobeniusNormKernel, - CUDAFrobeniusNormKernel); - -template -using CUDAFrobeniusNormGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm_grad, CUDAFrobeniusNormGradKernel, - CUDAFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h deleted file mode 100644 index 0b6b87d99ecd9..0000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -namespace paddle { -namespace operators { - -// \partial \| X \|_F = \frac{X}{ \| X \|_F } -template -class FrobeniusNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // default use Eigen broadcast - ReduceGradKernel kernel; - kernel.Compute(context); - } -}; - -struct FrobeniusNormFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = ((x->square()).sum(dim)).sqrt(); - } -}; - -struct FrobeniusNormGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = y->broadcast(dim); - dx->device(place) = *dx + dx->constant(1e-12f); - dx->device(place) = (*x / *dx) * (dy->broadcast(dim)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448c1..9115d21b195e1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); 
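This is the same mechanical swap applied to every operator in this diff: the hand-written InferShape (or the REGISTER_REDUCE_OP_WITHOUT_GRAD macro that generated one) is dropped, and DECLARE_INFER_SHAPE_FUNCTOR binds the operator's shape inference to a phi InferMeta function through PD_INFER_META, so static shape inference and the phi kernel share one implementation. A toy analogue of the idea with made-up names (MetaTensor here is a stand-in, not phi::MetaTensor):

#include <cstdint>
#include <cstdio>
#include <vector>

struct MetaTensor {            // stand-in: only carries dims
  std::vector<int64_t> dims;
};

// One shared shape function, callable from any runtime that can build
// MetaTensors; reduce_all collapses every dimension of its input.
void ReduceAllInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) {
  out->dims = keep_dim ? std::vector<int64_t>(x.dims.size(), 1)
                       : std::vector<int64_t>{};  // rank-0 result
}

int main() {
  MetaTensor x{{2, 3, 4}}, out;
  ReduceAllInferMeta(x, /*keep_dim=*/false, &out);
  std::printf("output rank = %zu\n", out.dims.size());  // prints 0
  return 0;
}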
+REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9e4..69561b9349888 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d057ee8f5d798..e327d19ab3be8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f09..15812778e0023 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa4629..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
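One consequence of dropping the fluid kernels is visible in the reduce_any NPU test change above: USE_OP(reduce_any) both declared the operator and force-linked its fluid CPU kernel, which this PR deletes, so the macro is downgraded. A sketch of the distinction, as I read the Paddle macros (the NPU kernel itself is untouched by this PR):

    // USE_OP(reduce_any);               // old: op definition + fluid kernel
    USE_OP_ITSELF(reduce_any);           // new: op definition only; the compute
                                         // kernel is now resolved from phi
    USE_OP_DEVICE_KERNEL(reduce_any, NPU);  // device kernel still fluid-side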
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a18330913803..dc41979defb93 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 11aa78382e319..5e5b04d57b002 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,21 +14,24 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu deleted file mode 100644 index 44548b8d2e778..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
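The same four-step replacement recurs for reduce_min here and for reduce_prod just below: delete the REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL blocks, add an op maker, declare an InferShapeFunctor backed by phi::ReduceInferMetaBase, and register the operator with that functor. Schematically, for a hypothetical op reduce_foo, with the template arguments written out as I believe the real registrations read (the diff rendering here strips angle-bracket contents):

    class ReduceFooOpMaker : public ops::ReduceOpMaker {
     protected:
      virtual std::string GetName() const { return "reduce_foo"; }
      virtual std::string GetOpType() const { return "Reduce reduce_foo"; }
    };

    DECLARE_INFER_SHAPE_FUNCTOR(reduce_foo, ReduceFooInferShapeFunctor,
                                PD_INFER_META(phi::ReduceInferMetaBase));

    REGISTER_OPERATOR(
        reduce_foo, ops::ReduceOp, ReduceFooOpMaker,
        paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
        paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
        ReduceFooInferShapeFunctor);
    REGISTER_OPERATOR(reduce_foo_grad, ops::ReduceGradOp);
    // No REGISTER_OP_*_KERNEL calls remain: the device kernels now live in the
    // phi kernel library and are looked up by kernel name at runtime.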
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786a8..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56c5..b1abdf9e8a758 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f89c..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6441d53239e95..2a78774f3706e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -114,16 +114,3 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index 2f6bf12751809..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457e..a473b54c1f855 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e10..bf78b6a696559 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. 
But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,21 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); + REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 18941d10e937d..0000000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. 
But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, 
roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index e71099ed99f00..0000000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,465 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - -template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? 
T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = 
phi::stride(out_grad->dims()); - - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d9905..78509e4299b80 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653e8..13490d6fcde3a 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd682b..12e33d56c0020 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -26,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. 
But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -212,20 +147,15 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bbc0..0000000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } 
else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - 
GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3e0..0000000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument("The 
rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 
0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fde8..898db4c22fed9 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
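[editor's note] The CPU kernel deleted above computes each output bin by projecting the ROI onto the feature map and max-pooling it, recording an argmax per bin. A minimal standalone sketch of that bin arithmetic, assuming a single-channel feature map and one ROI already scaled to feature-map coordinates (names are illustrative, not the Paddle API):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Max-pool one ROI over an H x W single-channel feature map into a
// pooled_h x pooled_w grid, mirroring the bin math of the removed kernel.
void RoiMaxPool(const std::vector<float>& feat, int H, int W,
                int roi_start_h, int roi_start_w, int roi_end_h, int roi_end_w,
                int pooled_h, int pooled_w,
                std::vector<float>* out, std::vector<int>* argmax) {
  // Malformed ROIs are forced to be at least 1x1, as in the kernel.
  int roi_h = std::max(roi_end_h - roi_start_h + 1, 1);
  int roi_w = std::max(roi_end_w - roi_start_w + 1, 1);
  float bin_h = static_cast<float>(roi_h) / pooled_h;
  float bin_w = static_cast<float>(roi_w) / pooled_w;
  out->assign(pooled_h * pooled_w, 0.f);
  argmax->assign(pooled_h * pooled_w, -1);
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // start (included) = floor(ph * bin_h), end (excluded) = ceil((ph+1) * bin_h),
      // shifted by the ROI origin and clamped to the feature map.
      int hs = std::min(std::max(static_cast<int>(std::floor(ph * bin_h)) + roi_start_h, 0), H);
      int he = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_h)) + roi_start_h, 0), H);
      int ws = std::min(std::max(static_cast<int>(std::floor(pw * bin_w)) + roi_start_w, 0), W);
      int we = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_w)) + roi_start_w, 0), W);
      bool empty = (he <= hs) || (we <= ws);
      float best = empty ? 0.f : -std::numeric_limits<float>::max();
      int best_idx = -1;
      for (int h = hs; h < he; ++h)
        for (int w = ws; w < we; ++w)
          if (feat[h * W + w] > best) { best = feat[h * W + w]; best_idx = h * W + w; }
      (*out)[ph * pooled_w + pw] = best;
      (*argmax)[ph * pooled_w + pw] = best_idx;  // -1 marks an empty bin
    }
  }
}

The recorded argmax is exactly what the backward kernels above consume: each output gradient is routed (atomically, in the CUDA version) to the single input element that won the max, and empty bins (argmax == -1) are skipped.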
-#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f9f..0000000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15eb..0000000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
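[editor's note] The deleted RollCudaKernel maps each flat input index to its destination by walking the rolled dimensions. The per-thread index math can be checked on the host; a sketch under the kernel's own conventions (shifts already normalized into [0, size) per axis, strides measured in elements):

#include <cstdint>
#include <vector>

// Host-side replica of the index remapping in the removed kernel: for each
// rolled axis i, the coordinate (idx / strides[i]) % sizes[i] advances by
// shifts[i], wrapping around sizes[i].
int64_t RollDestIndex(int64_t idx, const std::vector<int64_t>& shifts,
                      const std::vector<int64_t>& strides,
                      const std::vector<int64_t>& sizes) {
  int64_t out = idx;
  for (size_t i = 0; i < shifts.size(); ++i) {
    int64_t new_coord = (idx / strides[i]) % sizes[i] + shifts[i];
    out += (new_coord >= sizes[i]) ? (shifts[i] - sizes[i]) * strides[i]
                                   : shifts[i] * strides[i];
  }
  return out;  // output[out] = input[idx]
}

The backward kernel reuses the same formula with every shift negated and re-normalized as ((-s) % size + size) % size, which is why the grad specialization differs only in that preprocessing step.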
- -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 815984ac307fd..d5ef95269b48a 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7db9..3a6fdbaa2613d 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
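[editor's note] The removed CPU path copies the tensor into a host vector and rotates it slice by slice via shift_along_dim. In the 1-D case (axis = none) the whole routine collapses to std::rotate; a sketch, assuming the shift is reduced modulo the length exactly as the kernel does:

#include <algorithm>
#include <vector>

// 1-D equivalent of the removed shift_along_dim: roll right by `shift`.
void Roll1D(std::vector<float>* data, int64_t shift) {
  int64_t n = static_cast<int64_t>(data->size());
  if (n == 0) return;
  shift = ((shift % n) + n) % n;  // same normalization as the kernel
  // Rolling right by `shift` == rotating left by n - shift.
  std::rotate(data->begin(), data->begin() + (n - shift), data->end());
}

For example, Roll1D on {1,2,3,4,5} with shift 2 yields {4,5,1,2,3}; the grad kernel simply rolls by -shift to undo the permutation.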
-#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -116,11 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3a..9d4c8532a82c0 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index e147e62a98354..0000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988..0000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if 
(pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 7d0d782b837c4..73655bcb18500 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. 
#include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,21 +233,16 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index 9f291a863c067..0000000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 4d459f8c01b15..4696907f32e6d 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,14 +19,10 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -36,23 +32,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; switch (data_type) { @@ -121,253 +100,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = 
GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. 
- auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dc..46d64333b608b 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097..9001ce5d51dec 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
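[editor's note] The ~250 lines removed here implement the calculus of out = x; out[starts:ends:steps] = v. The gradient w.r.t. Input is grad(Out) with the assigned window zeroed out, and the gradient w.r.t. ValueTensor is that window gathered from grad(Out), then accumulated over broadcast axes (the GetOffsets walk) and reversed for negative steps. A 1-D sketch of the two rules, ignoring broadcasting and negative steps:

#include <cstdint>
#include <vector>

// Gradients of: out = x; out[start:end:step] = v   (1-D, step > 0)
void SetValueGrad1D(const std::vector<float>& grad_out, int64_t start,
                    int64_t end, int64_t step,
                    std::vector<float>* grad_x, std::vector<float>* grad_v) {
  *grad_x = grad_out;                // gradient passes through everywhere...
  grad_v->clear();
  for (int64_t i = start; i < end; i += step) {
    grad_v->push_back(grad_out[i]);  // ...the window's gradient flows to v
    (*grad_x)[i] = 0.f;              // ...and is cut off from x
  }
}

The removed Eigen code expresses exactly this with stridedSlice: copy grad_out into grad_input, overwrite the sliced view with zeros, and slice grad_out to fill grad_value; when v was broadcast, each broadcast copy is accumulated back into the smaller grad_value via the enumerated offsets.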
*/ -#include "paddle/fluid/operators/shape_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -91,13 +81,12 @@ Return the shape of the input. namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84d..0000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710..0000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d66834..f751ab41014c2 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a643253..a62d1b434e764 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index 54555e494ffe5..053a90f2fc9fa 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,27 +23,6 @@ namespace operators { class ShardIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U, - platform::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -114,7 +96,10 @@ the original index should be recalculated (i.e. sharded) before. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp, - ops::ShardIndexOpMaker); -REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel, - ops::ShardIndexCPUKernel); +DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor, + PD_INFER_META(phi::ShardIndexInferMeta)); +REGISTER_OPERATOR( + shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ShardIndexInferShapeFunctor); diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu deleted file mode 100644 index 115b3f47d664b..0000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = 
out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h deleted file mode 100644 index c2fe3711686d4..0000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} 
// namespace operators
-} // namespace paddle
diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc
index dc2e8ad58f31c..c875448424a24 100644
--- a/paddle/fluid/operators/shard_index_op_npu.cc
+++ b/paddle/fluid/operators/shard_index_op_npu.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/shard_index_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 374992096605b..3840b99dd176d 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */

 #include
 #include

+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

@@ -23,6 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif

+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {

@@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::NotFound("Input(X) of SoftmaxOp is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound("Output(Out) of SoftmaxOp is not found."));
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto rank_x = dim_x.size();
-    auto axis = ctx->Attrs().Get("axis");
-    PADDLE_ENFORCE_GE(axis, -rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-    PADDLE_ENFORCE_LT(axis, rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Out"), true,
-        platform::errors::InvalidArgument("Input(Out) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")), true,
-        platform::errors::InvalidArgument("Input(Out@GRAD) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Out"),
-        ctx->GetInputDim(framework::GradVarName("Out")),
-        platform::errors::InvalidArgument("Input(Out) and its gradients "
-                                          "should have a same shape."));
-
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"});

 namespace ops = paddle::operators;

+DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::SoftmaxInferMeta));
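[Editor's note] The hunk above shows the migration pattern this patch applies to shape, shard_index, softmax, tile, and top_k_v2 alike: the hand-written InferShape override is deleted from the fluid operator and replaced by a phi InferMeta function bound through DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META and passed to REGISTER_OPERATOR. As a minimal sketch of what such an InferMeta amounts to for softmax: the type MetaTensorSketch and the function SoftmaxInferMetaSketch below are illustrative stand-ins, not the real phi API, which works on phi::MetaTensor.

    // Illustrative sketch only. A MetaTensor carries dims/dtype but no
    // data, so the same shape function can serve static graphs, dygraph,
    // and inference without a fluid InferShapeContext.
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    struct MetaTensorSketch {  // hypothetical stand-in for phi::MetaTensor
      std::vector<int64_t> dims;
      void set_dims(const std::vector<int64_t>& d) { dims = d; }
    };

    void SoftmaxInferMetaSketch(const MetaTensorSketch& x, int axis,
                                MetaTensorSketch* out) {
      const int rank = static_cast<int>(x.dims.size());
      // Same validation the deleted fluid InferShape performed:
      // Attr(axis) must lie in [-R, R) for an input of rank R.
      if (axis < -rank || axis >= rank) {
        throw std::invalid_argument("Attr(axis) should be in range [-R, R-1]");
      }
      // Softmax is shape-preserving: Out gets exactly X's dims.
      out->set_dims(x.dims);
    }

Once such a function exists in phi, the functor declared above runs the same logic in every execution mode, which is why the per-op InferShape bodies can be deleted wholesale.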
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e1..3148b31a8322e 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e72314d..41545a1ca20b2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = 
context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814ae6..aa944cfcfbb17 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h 
b/paddle/fluid/operators/squeeze_op.h index 58e5440689926..a776a78616b8d 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609e..d61f5aa3f634c 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde7d..0c178b02d0309 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a0..e179149c5bb77 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); - PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
-REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322..0000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, 
but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. 
reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000..95bfb9f4e1a9d --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +inline std::vector get_repeat_times( + const framework::ExecutionContext& ctx) { + if (ctx.HasInput("RepeatTimes")) { + auto* repeat_tensor = ctx.Input("RepeatTimes"); + auto* repeat_data = repeat_tensor->data(); + framework::Tensor cpu_repeat_tensor; + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_xpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); + repeat_data = cpu_repeat_tensor.data(); + } + auto vec_repeat_times = + std::vector(repeat_data, repeat_data + repeat_tensor->numel()); + return vec_repeat_times; + } + + auto list_repeat_times_tensor = + ctx.MultiInput("repeat_times_tensor"); + if (list_repeat_times_tensor.size() > 0) { + // get tensor from + std::vector vec_repeat_times; + for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { + auto tensor = list_repeat_times_tensor[i]; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_repeat_times.push_back(*temp.data()); + } else { + vec_repeat_times.push_back(*tensor->data()); + } + } + return vec_repeat_times; + } else { + return ctx.Attr>("repeat_times"); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 9e306c7be537b..cea6b458aec78 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 6b60b167a2465..598377587d6f7 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -11,11 +11,14 @@ limitations under the License. 
 */

 #ifdef PADDLE_WITH_XPU

-#include "paddle/fluid/operators/tile_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/tile_op_functor.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+
 template
 class TileXPUKernel : public framework::OpKernel {
  public:
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index d60976928e00c..80c9935057cb5 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -51,6 +51,19 @@ namespace operators {

 using Tensor = framework::Tensor;

+inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
 struct SegmentOffsetIter {
   EIGEN_DEVICE_FUNC
   explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index 810afc901df57..0a9ae789b01ee 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include

+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {

@@ -22,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2");
-
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast(ctx->Attrs().Get("axis"));
-    PADDLE_ENFORCE_EQ(
-        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
-        paddle::platform::errors::InvalidArgument(
-            "the axis of topk must be [-%d, %d), but you set axis is %d",
-            dim_size, dim_size, axis));
-
-    if (axis < 0) axis += dim_size;
-
-    int k;
-    auto k_is_tensor = ctx->HasInput("K");
-    if (k_is_tensor) {
-      k = -1;
-    } else {
-      k = static_cast(ctx->Attrs().Get("k"));
-      PADDLE_ENFORCE_EQ(k >= 1, true,
-                        paddle::platform::errors::InvalidArgument(
-                            "the attribute of k in the topk must >= 1 or be a "
-                            "Tensor, but received %d .",
-                            k));
-    }
-
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of topk must have >= 1d shape"));
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of topk op must have >= %d columns in axis of %d", k,
-              axis));
-    }
-
-    framework::DDim dims = input_dims;
-
-    dims[axis] = k;
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -168,20 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker {

 } // namespace paddle
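[Editor's note] The GetDims helper added to top_k_function_cuda.h above factors a shape into a (pre, n, post) triple around the reduction axis; the top_k grad kernel then treats a flat index as (pre_idx, axis_idx, post_idx). A self-contained sketch of the same computation, using std::vector<int64_t> in place of phi::DDim purely for illustration:

    // Same decomposition as the GetDims helper in the hunk above: pre is
    // the product of extents before `axis`, n the extent at `axis`, and
    // post the product after it.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    void GetDimsSketch(const std::vector<int64_t>& dim, int axis, int* pre,
                       int* n, int* post) {
      *pre = 1;
      *post = 1;
      *n = static_cast<int>(dim[axis]);
      for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(dim[i]);
      for (int i = axis + 1; i < static_cast<int>(dim.size()); ++i)
        *post *= static_cast<int>(dim[i]);
    }

    int main() {
      int pre = 0, n = 0, post = 0;
      GetDimsSketch({2, 3, 4, 5}, /*axis=*/2, &pre, &n, &post);
      assert(pre == 2 * 3 && n == 4 && post == 5);  // rows before, axis, cols after
      return 0;
    }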
namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72..0000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b..0000000000000
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
-  The reason we need topk v2 is compatibility: it redefines NaN as the
-  maximum value during comparison. Without topk v2, the inference results
-  of models trained by older versions of PaddlePaddle would be affected.
-*/
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopK(Type input_height, Type input_width, int input_dim,
-                     const framework::Tensor* input, T* t_out, Type* t_indices,
-                     const int& k, const bool& largest, const bool& sorted) {
-  // when k is small, use a partial sort
-  bool partial_sort_flag = (k * 64) < input_width;
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            if (largest) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            } else {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            }
-          });
-    } else {
-      // use nth_element to get the k-largest or k-smallest elements
-      if (largest) {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            });
-        // nth_element leaves the selected elements unordered, so sort them
-        if (sorted) {
-          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
-                    [&largest](const std::pair<T, Type>& l,
-                               const std::pair<T, Type>& r) {
-                      return (std::isnan(static_cast<double>(l.first)) &&
-                              !std::isnan(static_cast<double>(r.first))) ||
-                             (l.first > r.first);
-                    });
-        }
-      } else {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            });
-        // nth_element leaves the selected elements unordered, so sort them
-        if (sorted) {
-          std::sort(
-              col_vec.begin(), col_vec.begin() + k - 1,
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-        }
-      }
-    }
-    for (Type j = 0; j < k; ++j) {
-      t_out[i * k + j] = col_vec[j].first;
-      t_indices[i * k + j] = col_vec[j].second;
-    }
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopKAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim,
-                           const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data,
-                           const int& k) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(j)] = e_input(j);
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class TopkV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Get the top k elements of each row of input tensor
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto* indices = context.Output<Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
-    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
-
-    // axis < 0, calculate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-
-    // if the K tensor is not null, use the K tensor as k
-    auto* k_t = context.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      // according to axis, set the K value in the dims
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    const auto& out_dims = output->dims();
-    if (axis + 1 == in_dims.size()) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                           output_data, indices_data, k, largest, sorted);
-    } else {
-      // if the topk dim is not the last dim, transpose first, then do topk
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-
-      // get the transposed input_dims and out_dims
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_out_dims[i] = out_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // transpose the input value
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      // allocate temp tensors to save the topk indices and values
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      Tensor tmp_indices;
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      // get the TopK value
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
-                           &trans_inp, t_out, t_ind, k, largest, sorted);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TopkV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<Tensor>("Indices");
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-
-    const auto& in_dims = x->dims();
-    const auto& out_dims = indices->dims();
-
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    const size_t& k = out_dims[axis];
-
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis + 1 == in_dims.size()) {
-      // allocate the memory for the input_grad
-
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      // init the output grad with 0, because some input elements have no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // assign the output_grad to input_grad
-      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 out_grad, indices, x_grad_data, k);
-    } else {
-      // cannot assign grad to input_grad directly, must transpose first
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      // transpose the out_grad and indices
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // do the transpose
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context,
-                                                  *out_grad, &trans_dO, trans);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, *indices, &trans_ind, trans);
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-
-      // assign the out_grad to the transposed input_grad
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-
-      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 &trans_dO, &trans_ind, t_out, k);
-
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
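Worth pausing on the comparator the deleted FullTopK carried, since it is exactly the compatibility guarantee named in the file's header comment: NaN outranks every ordinary value when largest is true. A minimal standalone sketch of that ordering, assuming nothing from Paddle (plain C++, illustrative only):

// Standalone sketch of FullTopK's NaN-first ordering (not Paddle code).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<float, int>> col = {
      {1.0f, 0}, {NAN, 1}, {3.0f, 2}, {2.0f, 3}};
  const int k = 2;
  // Same predicate as the deleted largest branch: NaN outranks every
  // non-NaN value; otherwise compare normally.
  auto largest_cmp = [](const std::pair<float, int>& l,
                        const std::pair<float, int>& r) {
    return (std::isnan(static_cast<double>(l.first)) &&
            !std::isnan(static_cast<double>(r.first))) ||
           (l.first > r.first);
  };
  std::partial_sort(col.begin(), col.begin() + k, col.end(), largest_cmp);
  for (int j = 0; j < k; ++j) {
    std::printf("value=%f index=%d\n", col[j].first, col[j].second);
  }
  // Prints the NaN entry first, then 3.0, matching topk v2 semantics.
  return 0;
}

With this predicate, models trained under the old NaN handling keep their inference results even though the CPU kernel itself now lives in phi.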
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 5b8a6b3e75449..caaae02124c92 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
index e11070638834c..dff5c2d3f3937 100644
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc
index 49daac2ff0da6..4d9c39be92eff 100644
--- a/paddle/fluid/operators/top_k_v2_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "xpu/refactor/math.h"
 
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index 0590b66f6f868..c6c0fa3c0019e 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2.
 )DOC");
   }
 };
 
-class TraceOpGrad : public framework::OperatorWithKernel {
+class TraceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -114,7 +114,7 @@ REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker,
                   ops::TraceGradOpMaker<paddle::framework::OpDesc>,
                   ops::TraceGradOpMaker<paddle::imperative::OpBase>,
                   TraceInferShapeFunctor);
-REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad,
+REGISTER_OPERATOR(trace_grad, ops::TraceGradOp,
                   ops::TraceGradNoNeedBufferVarsInferer);
 
 /* ========================== register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index 5617d728a51dc..fb39034c8e92c 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc
index df84659a00f4c..35b925ca172b7 100644
--- a/paddle/fluid/operators/triangular_solve_op.cc
+++ b/paddle/fluid/operators/triangular_solve_op.cc
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/solve_op.h"
 #include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h
deleted file mode 100644
index 315847b4d800e..0000000000000
--- a/paddle/fluid/operators/triangular_solve_op.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "glog/logging.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-#include "paddle/fluid/operators/solve_op.h"
-#include "paddle/fluid/operators/tril_triu_op.h"
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-static void triangular_solve(const DeviceContext& context, const Tensor& x,
-                             const Tensor& y, Tensor* out, bool upper,
-                             bool transpose, bool unitriangular) {
-  // Tensor broadcast using the Eigen library
-  std::vector<int64_t> x_bst_dims_vec;
-  std::vector<int64_t> y_bst_dims_vec;
-  std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y);
-
-  Tensor x_bst(x.type());
-  TensorExpand<T, DeviceContext>(context, x, &x_bst, x_bst_dims_vec);
-
-  Tensor y_bst(y.type());
-  TensorExpand<T, DeviceContext>(context, y, &y_bst, y_bst_dims_vec);
-
-  // TriangularSolveFunctor performs calculations in-place
-  // x_clone should be a copy of 'x' after broadcast
-  // out should be a copy of 'y' after broadcast
-  Tensor x_clone(x.type());
-  x_clone.Resize(phi::make_ddim(x_bst_dims_vec));
-  x_clone.mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone);
-
-  out->Resize(phi::make_ddim(y_bst_dims_vec));
-  out->mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(y_bst, context.GetPlace(), context, out);
-
-  math::TriangularSolveFunctor<DeviceContext, T> functor;
-  functor(context, &x_clone, out, /*left=*/true, upper, transpose,
-          unitriangular);
-}
-
-template <typename DeviceContext, typename T>
-class MatrixReduceSumFunctor {
- public:
-  void operator()(const Tensor& input, Tensor* output,
-                  const framework::ExecutionContext& ctx);
-};
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const Tensor& in, Tensor* out,
-                  const framework::ExecutionContext& ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3]; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-    out->Resize(phi::make_ddim(out_bst_dims));
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-
-    ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
-        &in, out, out_reduce_dims, true, false, ctx)
-        .template apply<T>();
-    out->Resize(phi::make_ddim(out_dims));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
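The deleted MatrixReduceSumFunctor exists because triangular_solve broadcasts its batch dimensions: in the backward pass, every axis that was expanded from size 1 has to be summed back. A small self-contained sketch of just that axis bookkeeping, using the example from the deleted comment (hypothetical standalone code, not the Paddle functor):

// Sketch of how the deleted functor picks the axes to sum over
// (standalone, hypothetical code; not the Paddle implementation).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Example from the deleted comment: in dims [5, 3, 2, 7, 3],
  // out dims [3, 1, 7, 3] -> out_reduce_dim should be [0, 2].
  std::vector<int64_t> in_dims = {5, 3, 2, 7, 3};
  std::vector<int64_t> out_dims = {3, 1, 7, 3};

  // Left-pad out_dims with 1s so both shapes have the same rank.
  std::vector<int64_t> out_bst(in_dims.size(), 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_dims.size() - out_dims.size()));

  // Any batch axis where the gradient was broadcast (out is 1 but in is
  // not) must be summed; the last two axes are the matrix itself.
  std::vector<int> reduce_dims;
  for (size_t i = 0; i + 2 < in_dims.size(); ++i) {
    if (in_dims[i] != 1 && out_bst[i] == 1) {
      reduce_dims.push_back(static_cast<int>(i));
    }
  }
  for (int d : reduce_dims) {
    std::printf("reduce axis %d\n", d);  // prints 0 and 2
  }
  return 0;
}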
diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc
index 3e943c62e1ce1..c8010e8a128e0 100644
--- a/paddle/fluid/operators/tril_triu_op.cc
+++ b/paddle/fluid/operators/tril_triu_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
                   ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
                   ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::float16>);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu
deleted file mode 100644
index 9cbbdeeb2ce28..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.cu
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tril_triu_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<plat::CUDADeviceContext, bool>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, float>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, double>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, int>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, float>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, double>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, int>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h
deleted file mode 100644
index 3150b7617d10a..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TrilTriuCompute {
- public:
-  HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
-                             const int64_t H, const int64_t W, T* out)
-      : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
-
-  HOSTDEVICE void operator()(int64_t idx) {
-    const int64_t row = (idx / W_) % H_;
-    const int64_t col = idx % W_;
-    const bool mask =
-        lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
-    out_[idx] = mask ? static_cast<T>(0) : in_[idx];
-  }
-
- private:
-  const T* in_;
-  const int diagonal_;
-  const bool lower_;
-  const int64_t H_;
-  const int64_t W_;
-  T* out_;
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* x = context.Input<framework::Tensor>("X");
-    const auto* x_data = x->data<T>();
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = x->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(x->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_computer(
-        x_data, diagonal, lower, H, W, out_data);
-    for_range(tril_triu_computer);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    const auto* dout_data = d_out->data<T>();
-    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = d_out->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(d_out->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
-        dout_data, diagonal, lower, H, W, dx_data);
-    for_range(tril_triu_grad_computer);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc
index ad1c1814c05cd..4145730357d60 100644
--- a/paddle/fluid/operators/tril_triu_op_npu.cc
+++ b/paddle/fluid/operators/tril_triu_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
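Before the XPU port below, note the predicate the deleted TrilTriuCompute used: an element at (row, col) is zeroed when col - row > diagonal for tril, and when col - row < diagonal for triu. A standalone check of that rule on a 3x3 matrix (illustrative only):

// Standalone demo of TrilTriuCompute's masking rule on a 3x3 matrix.
#include <cstdio>

int main() {
  const int H = 3, W = 3, diagonal = 0;
  const bool lower = true;  // tril; set false for triu
  for (int idx = 0; idx < H * W; ++idx) {
    const int row = (idx / W) % H;
    const int col = idx % W;
    const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal);
    std::printf("%d ", mask ? 0 : 1);  // 1 = element kept
    if (col == W - 1) std::printf("\n");
  }
  // Prints the lower-triangular keep pattern:
  // 1 0 0
  // 1 1 0
  // 1 1 1
  return 0;
}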
diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc
new file mode 100644
index 0000000000000..a44ea8ff689b8
--- /dev/null
+++ b/paddle/fluid/operators/tril_triu_op_xpu.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under
+the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class TrilTriuXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* x = context.Input<Tensor>("X");
+    const auto* x_data = x->data<T>();
+    auto* out = context.Output<Tensor>("Out");
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int diagonal = context.Attr<int>("diagonal");
+    const bool lower = context.Attr<bool>("lower");
+    auto xshape = phi::vectorize<int>(x->dims());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = 0;
+    if (lower) {
+      r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
+    } else {
+      r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    tril_triu, ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810..1be8f3387dbad 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
index 7f676cbb65ee4..f6112fb59c122 100644
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ b/paddle/fluid/operators/unsqueeze_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3..a8ced783744a9 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a84..602376d54e0d2 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel<platform::CPUDeviceContext, float>,
-    ops::ViterbiDecodeKernel<platform::CPUDeviceContext, double>);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
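The viterbi_decode change repeats the PR's recurring pattern: the hand-written InferShape override is deleted and DECLARE_INFER_SHAPE_FUNCTOR wires the op to phi::ViterbiDecodeInferMeta at registration time. The shape contract itself does not change; restated below as a runnable standalone sketch of the checks the deleted code performed (illustrative, not the phi implementation):

// Standalone restatement of the shape contract that moved from the deleted
// fluid InferShape into phi::ViterbiDecodeInferMeta (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> input = {8, 20, 5};   // [batch, seq_len, n_tags]
  std::vector<int64_t> transition = {5, 5};  // [n_tags, n_tags]
  std::vector<int64_t> length = {8};         // [batch]

  assert(input.size() == 3);          // Input must be rank 3
  assert(length.size() == 1);         // Length must be rank 1
  assert(transition.size() == 2);     // Transition must be rank 2
  assert(input[0] == length[0]);      // batch sizes agree (runtime check)
  assert(input[2] == transition[0]);  // tag counts agree (runtime check)

  // Scores inherits Length's dims, as in ctx->SetOutputDim("Scores", ...).
  std::vector<int64_t> scores_dims = length;
  std::printf("Scores dims: [%lld]\n", static_cast<long long>(scores_dims[0]));
  return 0;
}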
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c4..0000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template
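ComputeBlockSize in the deleted CUDA file picks the launch block size by rounding the column count up to the next power of two, clamped to [8, 1024], which is exactly the set of cases the FIXED_BLOCK_DIM_CASE dispatch above can instantiate. A quick standalone check (illustrative only):

// Quick check of the deleted ComputeBlockSize: round the column count up
// to the next power of two, clamped to [8, 1024] (standalone sketch).
#include <cstdint>
#include <cstdio>

int64_t ComputeBlockSize(int64_t col) {
  if (col > 512) return 1024;
  if (col > 256) return 512;
  if (col > 128) return 256;
  if (col > 64) return 128;
  if (col > 32) return 64;
  if (col > 16) return 32;
  if (col > 8) return 16;
  return 8;
}

int main() {
  const int64_t cols[] = {5, 8, 9, 100, 600};
  for (int64_t c : cols) {
    std::printf("col=%lld -> block=%lld\n", static_cast<long long>(c),
                static_cast<long long>(ComputeBlockSize(c)));
  }
  // 5 -> 8, 8 -> 8, 9 -> 16, 100 -> 128, 600 -> 1024
  return 0;
}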